From 351f94d981f363909ae6e76ed57cd0a75c3f5688 Mon Sep 17 00:00:00 2001
From: Rageking8 <106309953+Rageking8@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:05:53 +0800
Subject: [clang][NFC] resolve redundant predicates (#79701)

Fixes #79686
---
 clang/lib/Sema/SemaChecking.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index b071a02..c775ff2 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -17183,7 +17183,7 @@ public:
     // evaluates to true.
     bool EvalResult = false;
     bool EvalOK = Eval.evaluate(BO->getLHS(), EvalResult);
-    bool ShouldVisitRHS = !EvalOK || (EvalOK && !EvalResult);
+    bool ShouldVisitRHS = !EvalOK || !EvalResult;
     if (ShouldVisitRHS) {
       Region = RHSRegion;
       Visit(BO->getRHS());
@@ -17215,7 +17215,7 @@ public:
     // [...] the second operand is not evaluated if the first operand is false.
     bool EvalResult = false;
     bool EvalOK = Eval.evaluate(BO->getLHS(), EvalResult);
-    bool ShouldVisitRHS = !EvalOK || (EvalOK && EvalResult);
+    bool ShouldVisitRHS = !EvalOK || EvalResult;
     if (ShouldVisitRHS) {
       Region = RHSRegion;
       Visit(BO->getRHS());
@@ -17266,8 +17266,8 @@ public:
     // evaluated. [...]
     bool EvalResult = false;
     bool EvalOK = Eval.evaluate(CO->getCond(), EvalResult);
-    bool ShouldVisitTrueExpr = !EvalOK || (EvalOK && EvalResult);
-    bool ShouldVisitFalseExpr = !EvalOK || (EvalOK && !EvalResult);
+    bool ShouldVisitTrueExpr = !EvalOK || EvalResult;
+    bool ShouldVisitFalseExpr = !EvalOK || !EvalResult;
     if (ShouldVisitTrueExpr) {
       Region = TrueRegion;
       Visit(CO->getTrueExpr());
-- 
cgit v1.1

From 8f6e13e6da84510c8321717860fd506e12118514 Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Wed, 7 Feb 2024 21:35:43 -0800
Subject: [clang-format] Fix a regression in dumping the config (#80628)

Commit d813af73f70f addressed a regression introduced by commit 3791b3fca6ea
but caused `clang-format -dump-config` to "hang". This patch reverts changes
to ClangFormat.cpp by both commits and reworks the cleanup.

Fixes #80621.
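A minimal sketch of the reworked file-name selection, for illustration only
(the helper `styleSearchFile` is hypothetical and not part of this patch):

```cpp
#include <string>
#include <vector>

// Mirrors the new dumpConfig() behavior: with no input file, or when the
// input is stdin ("-"), style/language detection falls back to the
// -assume-filename value instead of requiring FileNames[0] to name a file.
static std::string styleSearchFile(const std::vector<std::string> &FileNames,
                                   const std::string &AssumeFileName) {
  if (FileNames.empty() || FileNames[0] == "-")
    return AssumeFileName;
  return FileNames[0];
}
```

This is why `clang-format -assume-filename=foo.m -dump-config` can now report
`Language: ObjC` without any input file, as the new test below checks.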
--- clang/test/Format/dump-config-objc-stdin.m | 3 ++ clang/test/Format/verbose.cpp | 10 ++---- clang/tools/clang-format/ClangFormat.cpp | 49 +++++++++++++++--------------- 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/clang/test/Format/dump-config-objc-stdin.m b/clang/test/Format/dump-config-objc-stdin.m index b22ff7b..d81711a 100644 --- a/clang/test/Format/dump-config-objc-stdin.m +++ b/clang/test/Format/dump-config-objc-stdin.m @@ -1,5 +1,8 @@ +// RUN: clang-format -assume-filename=foo.m -dump-config | FileCheck %s + // RUN: clang-format -dump-config - < %s | FileCheck %s // CHECK: Language: ObjC + @interface Foo @end diff --git a/clang/test/Format/verbose.cpp b/clang/test/Format/verbose.cpp index dd625e3..4ab03d8 100644 --- a/clang/test/Format/verbose.cpp +++ b/clang/test/Format/verbose.cpp @@ -1,12 +1,6 @@ -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format -verbose 2> %t.stderr // RUN: not grep "Formatting" %t.stderr -// RUN: clang-format %s -verbose 2> %t.stderr -// RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr -// RUN: clang-format %s -verbose=false 2> %t.stderr -// RUN: not grep "Formatting" %t.stderr - -int a; -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format %s 2> %t.stderr // RUN: not grep "Formatting" %t.stderr // RUN: clang-format %s -verbose 2> %t.stderr // RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 5ee6092..e122cea 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -399,7 +399,8 @@ class ClangFormatDiagConsumer : public DiagnosticConsumer { }; // Returns true on error. -static bool format(StringRef FileName, bool IsSTDIN) { +static bool format(StringRef FileName) { + const bool IsSTDIN = FileName == "-"; if (!OutputXML && Inplace && IsSTDIN) { errs() << "error: cannot use -i when reading from stdin.\n"; return false; @@ -545,24 +546,25 @@ static void PrintVersion(raw_ostream &OS) { } // Dump the configuration. -static int dumpConfig(bool IsSTDIN) { +static int dumpConfig() { std::unique_ptr Code; - - // `FileNames` must have at least "-" in it even if no file was specified. - assert(!FileNames.empty()); - - // Read in the code in case the filename alone isn't enough to detect the - // language. - ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(FileNames[0]); - if (std::error_code EC = CodeOrErr.getError()) { - llvm::errs() << EC.message() << "\n"; - return 1; + // We can't read the code to detect the language if there's no file name. + if (!FileNames.empty()) { + // Read in the code in case the filename alone isn't enough to detect the + // language. + ErrorOr> CodeOrErr = + MemoryBuffer::getFileOrSTDIN(FileNames[0]); + if (std::error_code EC = CodeOrErr.getError()) { + llvm::errs() << EC.message() << "\n"; + return 1; + } + Code = std::move(CodeOrErr.get()); } - Code = std::move(CodeOrErr.get()); - llvm::Expected FormatStyle = - clang::format::getStyle(Style, IsSTDIN ? AssumeFileName : FileNames[0], + clang::format::getStyle(Style, + FileNames.empty() || FileNames[0] == "-" + ? AssumeFileName + : FileNames[0], FallbackStyle, Code ? 
Code->getBuffer() : ""); if (!FormatStyle) { llvm::errs() << llvm::toString(FormatStyle.takeError()) << "\n"; @@ -682,11 +684,8 @@ int main(int argc, const char **argv) { return 0; } - if (FileNames.empty()) - FileNames.push_back("-"); - if (DumpConfig) - return dumpConfig(FileNames[0] == "-"); + return dumpConfig(); if (!Files.empty()) { std::ifstream ExternalFileOfFiles{std::string(Files)}; @@ -699,7 +698,10 @@ int main(int argc, const char **argv) { errs() << "Clang-formating " << LineNo << " files\n"; } - if (FileNames.size() != 1 && + if (FileNames.empty()) + return clang::format::format("-"); + + if (FileNames.size() > 1 && (!Offsets.empty() || !Lengths.empty() || !LineRanges.empty())) { errs() << "error: -offset, -length and -lines can only be used for " "single file.\n"; @@ -709,14 +711,13 @@ int main(int argc, const char **argv) { unsigned FileNo = 1; bool Error = false; for (const auto &FileName : FileNames) { - const bool IsSTDIN = FileName == "-"; - if (!IsSTDIN && isIgnored(FileName)) + if (isIgnored(FileName)) continue; if (Verbose) { errs() << "Formatting [" << FileNo++ << "/" << FileNames.size() << "] " << FileName << "\n"; } - Error |= clang::format::format(FileName, IsSTDIN); + Error |= clang::format::format(FileName); } return Error ? 1 : 0; } -- cgit v1.1 From c8ca98a2a9796797f2eab00cc6516610c133633a Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 8 Feb 2024 13:45:27 +0800 Subject: [InstCombine] Handle IsInf/IsZero idioms (#80607) This patch does the following folds: ``` icmp eq/ne (bitcast X to int), (bitcast +/-inf to int) -> llvm.is.fpclass(X, (~)fcPosInf/fcNegInf) icmp eq/ne (bitcast X to int), (bitcast +0/-0 to int) -> llvm.is.fpclass(X, (~)fcPosZero/fcNegZero) ``` Alive2: https://alive2.llvm.org/ce/z/JJmEE9 --- .../Transforms/InstCombine/InstCombineCompares.cpp | 30 +++-- .../Transforms/InstCombine/fpclass-check-idioms.ll | 150 ++++++++++++++++++++- 2 files changed, 169 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 7aac13f..cbb6988 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3229,16 +3229,16 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) { if (Cmp.isEquality() && match(Op1, m_Zero())) return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType())); - // If this is a sign-bit test of a bitcast of a casted FP value, eliminate - // the FP extend/truncate because that cast does not change the sign-bit. - // This is true for all standard IEEE-754 types and the X86 80-bit type. - // The sign-bit is always the most significant bit in those types. const APInt *C; bool TrueIfSigned; - if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() && - isSignBitCheck(Pred, *C, TrueIfSigned)) { - if (match(BCSrcOp, m_FPExt(m_Value(X))) || - match(BCSrcOp, m_FPTrunc(m_Value(X)))) { + if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse()) { + // If this is a sign-bit test of a bitcast of a casted FP value, eliminate + // the FP extend/truncate because that cast does not change the sign-bit. + // This is true for all standard IEEE-754 types and the X86 80-bit type. + // The sign-bit is always the most significant bit in those types. 
+ if (isSignBitCheck(Pred, *C, TrueIfSigned) && + (match(BCSrcOp, m_FPExt(m_Value(X))) || + match(BCSrcOp, m_FPTrunc(m_Value(X))))) { // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0 // (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1 Type *XType = X->getType(); @@ -3257,6 +3257,20 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) { ConstantInt::getAllOnesValue(NewType)); } } + + // icmp eq/ne (bitcast X to int), special fp -> llvm.is.fpclass(X, class) + Type *FPType = SrcType->getScalarType(); + if (!Cmp.getParent()->getParent()->hasFnAttribute( + Attribute::NoImplicitFloat) && + Cmp.isEquality() && FPType->isIEEELikeFPTy()) { + FPClassTest Mask = APFloat(FPType->getFltSemantics(), *C).classify(); + if (Mask & (fcInf | fcZero)) { + if (Pred == ICmpInst::ICMP_NE) + Mask = ~Mask; + return replaceInstUsesWith(Cmp, + Builder.createIsFPClass(BCSrcOp, Mask)); + } + } } } diff --git a/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll b/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll index 019db34..d2b4536 100644 --- a/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll +++ b/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll @@ -40,13 +40,11 @@ define i1 @f64_fcnan_fcinf(double %a) { ret i1 %cmp } -; TODO: handle more fpclass check idioms define i1 @f32_fcinf(float %a) { ; CHECK-LABEL: define i1 @f32_fcinf( ; CHECK-SAME: float [[A:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[A]]) -; CHECK-NEXT: [[AND:%.*]] = bitcast float [[TMP1]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 2139095040 +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], 0x7FF0000000000000 ; CHECK-NEXT: ret i1 [[CMP]] ; %i32 = bitcast float %a to i32 @@ -55,6 +53,63 @@ define i1 @f32_fcinf(float %a) { ret i1 %cmp } +define i1 @f32_fcposinf(float %a) { +; CHECK-LABEL: define i1 @f32_fcposinf( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2139095040 + ret i1 %cmp +} + +define i1 @f32_fcneginf(float %a) { +; CHECK-LABEL: define i1 @f32_fcneginf( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0xFFF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 4286578688 + ret i1 %cmp +} + +define i1 @f32_fcposzero(float %a) { +; CHECK-LABEL: define i1 @f32_fcposzero( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 64) +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 0 + ret i1 %cmp +} + +define i1 @f32_fcnegzero(float %a) { +; CHECK-LABEL: define i1 @f32_fcnegzero( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 32) +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2147483648 + ret i1 %cmp +} + +define i1 @f32_fczero(float %a) { +; CHECK-LABEL: define i1 @f32_fczero( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %and = and i32 %i32, 2147483647 + %cmp = icmp eq i32 %and, 0 + ret i1 %cmp +} + +; TODO: handle more fpclass check idioms define i1 @f32_fcnan(float %a) { ; CHECK-LABEL: define i1 @f32_fcnan( ; CHECK-SAME: float [[A:%.*]]) { @@ -101,6 +156,19 @@ define <2 x 
i1> @f32_fcnan_fcinf_vec(<2 x float> %a) { ret <2 x i1> %cmp } +define <2 x i1> @f32_fcinf_vec(<2 x float> %a) { +; CHECK-LABEL: define <2 x i1> @f32_fcinf_vec( +; CHECK-SAME: <2 x float> [[A:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <2 x float> [[TMP1]], +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %i32 = bitcast <2 x float> %a to <2 x i32> + %and = and <2 x i32> %i32, + %cmp = icmp eq <2 x i32> %and, + ret <2 x i1> %cmp +} + ; Negative tests define i1 @f32_fcnan_fcinf_wrong_mask1(float %a) { @@ -158,6 +226,18 @@ define i1 @f32_fcnan_fcinf_wrong_pred(float %a) { ret i1 %cmp } +define i1 @f32_fcposzero_wrong_pred(float %a) { +; CHECK-LABEL: define i1 @f32_fcposzero_wrong_pred( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I32]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp slt i32 %i32, 0 + ret i1 %cmp +} + define i1 @f32_fcnan_fcinf_wrong_type1(<2 x float> %a) { ; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type1( ; CHECK-SAME: <2 x float> [[A:%.*]]) { @@ -172,6 +252,18 @@ define i1 @f32_fcnan_fcinf_wrong_type1(<2 x float> %a) { ret i1 %cmp } +define i1 @f32_fcposinf_wrong_type1(<2 x float> %a) { +; CHECK-LABEL: define i1 @f32_fcposinf_wrong_type1( +; CHECK-SAME: <2 x float> [[A:%.*]]) { +; CHECK-NEXT: [[I64:%.*]] = bitcast <2 x float> [[A]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[I64]], 2139095040 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i64 = bitcast <2 x float> %a to i64 + %cmp = icmp eq i64 %i64, 2139095040 + ret i1 %cmp +} + define i1 @f32_fcnan_fcinf_wrong_type2(x86_fp80 %a) { ; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type2( ; CHECK-SAME: x86_fp80 [[A:%.*]]) { @@ -186,6 +278,18 @@ define i1 @f32_fcnan_fcinf_wrong_type2(x86_fp80 %a) { ret i1 %cmp } +define i1 @f32_fcposzero_wrong_type2(x86_fp80 %a) { +; CHECK-LABEL: define i1 @f32_fcposzero_wrong_type2( +; CHECK-SAME: x86_fp80 [[A:%.*]]) { +; CHECK-NEXT: [[I80:%.*]] = bitcast x86_fp80 [[A]] to i80 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i80 [[I80]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i80 = bitcast x86_fp80 %a to i80 + %cmp = icmp eq i80 %i80, 0 + ret i1 %cmp +} + define i1 @f32_fcnan_fcinf_noimplicitfloat(float %a) #0 { ; CHECK-LABEL: define i1 @f32_fcnan_fcinf_noimplicitfloat( ; CHECK-SAME: float [[A:%.*]]) #[[ATTR1:[0-9]+]] { @@ -200,4 +304,44 @@ define i1 @f32_fcnan_fcinf_noimplicitfloat(float %a) #0 { ret i1 %cmp } +define i1 @f32_fcposinf_noimplicitfloat(float %a) #0 { +; CHECK-LABEL: define i1 @f32_fcposinf_noimplicitfloat( +; CHECK-SAME: float [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095040 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2139095040 + ret i1 %cmp +} + +define i1 @f32_fcposnan(float %a) { +; CHECK-LABEL: define i1 @f32_fcposnan( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095041 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + %cmp = icmp eq i32 %i32, 2139095041 + ret i1 %cmp +} + +define i1 @f32_fcposinf_multiuse(float %a) { +; CHECK-LABEL: define i1 @f32_fcposinf_multiuse( +; CHECK-SAME: float [[A:%.*]]) { +; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: call void @usei32(i32 [[I32]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp 
eq i32 [[I32]], 2139095040 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i32 = bitcast float %a to i32 + call void @usei32(i32 %i32) + %cmp = icmp eq i32 %i32, 2139095040 + ret i1 %cmp +} + +declare void @usei32(i32) + attributes #0 = { noimplicitfloat } -- cgit v1.1 From e17dded8d712fb13c30fd88f7810edaa0ee3e60d Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 8 Feb 2024 15:07:35 +0800 Subject: [InstSimplify] Generalize `simplifyAndOrOfFCmps` (#81027) This patch generalizes `simplifyAndOrOfFCmps` to simplify patterns like: ``` define i1 @src(float %x, float %y) { %or.cond.i = fcmp ord float %x, 0.000000e+00 %cmp.i.i34 = fcmp olt float %x, %y %cmp.i2.sink.i = and i1 %or.cond.i, %cmp.i.i34 ret i1 %cmp.i2.sink.i } define i1 @tgt(float %x, float %y) { %cmp.i.i34 = fcmp olt float %x, %y ret i1 %cmp.i.i34 } ``` Alive2: https://alive2.llvm.org/ce/z/9rydcx This patch and #80986 will fix the regression introduced by #80941. See also the IR diff https://github.com/dtcxzyw/llvm-opt-benchmark/pull/199#discussion_r1480974120. --- llvm/lib/Analysis/InstructionSimplify.cpp | 43 +++--- .../InstCombine/create-class-from-logic-fcmp.ll | 20 +-- .../test/Transforms/InstSimplify/logic-of-fcmps.ll | 167 +++++++++++++++++++++ 3 files changed, 199 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 01b0171..51e258d 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -1853,35 +1853,36 @@ static Value *simplifyAndOrOfFCmps(const SimplifyQuery &Q, FCmpInst *LHS, return nullptr; FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); - if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) || - (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) { - // (fcmp ord NNAN, X) & (fcmp ord X, Y) --> fcmp ord X, Y - // (fcmp ord NNAN, X) & (fcmp ord Y, X) --> fcmp ord Y, X - // (fcmp ord X, NNAN) & (fcmp ord X, Y) --> fcmp ord X, Y - // (fcmp ord X, NNAN) & (fcmp ord Y, X) --> fcmp ord Y, X - // (fcmp uno NNAN, X) | (fcmp uno X, Y) --> fcmp uno X, Y - // (fcmp uno NNAN, X) | (fcmp uno Y, X) --> fcmp uno Y, X - // (fcmp uno X, NNAN) | (fcmp uno X, Y) --> fcmp uno X, Y - // (fcmp uno X, NNAN) | (fcmp uno Y, X) --> fcmp uno Y, X + if ((PredL == FCmpInst::FCMP_ORD || PredL == FCmpInst::FCMP_UNO) && + ((FCmpInst::isOrdered(PredR) && IsAnd) || + (FCmpInst::isUnordered(PredR) && !IsAnd))) { + // (fcmp ord X, NNAN) & (fcmp o** X, Y) --> fcmp o** X, Y + // (fcmp uno X, NNAN) & (fcmp o** X, Y) --> false + // (fcmp uno X, NNAN) | (fcmp u** X, Y) --> fcmp u** X, Y + // (fcmp ord X, NNAN) | (fcmp u** X, Y) --> true if (((LHS1 == RHS0 || LHS1 == RHS1) && isKnownNeverNaN(LHS0, /*Depth=*/0, Q)) || ((LHS0 == RHS0 || LHS0 == RHS1) && isKnownNeverNaN(LHS1, /*Depth=*/0, Q))) - return RHS; - - // (fcmp ord X, Y) & (fcmp ord NNAN, X) --> fcmp ord X, Y - // (fcmp ord Y, X) & (fcmp ord NNAN, X) --> fcmp ord Y, X - // (fcmp ord X, Y) & (fcmp ord X, NNAN) --> fcmp ord X, Y - // (fcmp ord Y, X) & (fcmp ord X, NNAN) --> fcmp ord Y, X - // (fcmp uno X, Y) | (fcmp uno NNAN, X) --> fcmp uno X, Y - // (fcmp uno Y, X) | (fcmp uno NNAN, X) --> fcmp uno Y, X - // (fcmp uno X, Y) | (fcmp uno X, NNAN) --> fcmp uno X, Y - // (fcmp uno Y, X) | (fcmp uno X, NNAN) --> fcmp uno Y, X + return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR) + ? 
static_cast(RHS) + : ConstantInt::getBool(LHS->getType(), !IsAnd); + } + + if ((PredR == FCmpInst::FCMP_ORD || PredR == FCmpInst::FCMP_UNO) && + ((FCmpInst::isOrdered(PredL) && IsAnd) || + (FCmpInst::isUnordered(PredL) && !IsAnd))) { + // (fcmp o** X, Y) & (fcmp ord X, NNAN) --> fcmp o** X, Y + // (fcmp o** X, Y) & (fcmp uno X, NNAN) --> false + // (fcmp u** X, Y) | (fcmp uno X, NNAN) --> fcmp u** X, Y + // (fcmp u** X, Y) | (fcmp ord X, NNAN) --> true if (((RHS1 == LHS0 || RHS1 == LHS1) && isKnownNeverNaN(RHS0, /*Depth=*/0, Q)) || ((RHS0 == LHS0 || RHS0 == LHS1) && isKnownNeverNaN(RHS1, /*Depth=*/0, Q))) - return LHS; + return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR) + ? static_cast(LHS) + : ConstantInt::getBool(LHS->getType(), !IsAnd); } return nullptr; diff --git a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll index 24dac97..12c608c 100644 --- a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll +++ b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll @@ -1100,8 +1100,8 @@ define i1 @uge_smallest_normal_or_ord(half %x) #0 { ; -> nan | pnormal | pinf define i1 @uge_smallest_normal_or_uno(half %x) #0 { ; CHECK-LABEL: @uge_smallest_normal_or_uno( -; CHECK-NEXT: [[CLASS:%.*]] = call i1 @llvm.is.fpclass.f16(half [[X:%.*]], i32 771) -; CHECK-NEXT: ret i1 [[CLASS]] +; CHECK-NEXT: [[CMP_SMALLEST_NORMAL:%.*]] = fcmp uge half [[X:%.*]], 0xH0400 +; CHECK-NEXT: ret i1 [[CMP_SMALLEST_NORMAL]] ; %uno = fcmp uno half %x, 0.0 %cmp.smallest.normal = fcmp uge half %x, 0xH0400 @@ -1307,8 +1307,8 @@ define i1 @oge_fabs_eq_inf_and_ord(half %x) #0 { define i1 @oge_eq_inf_and_ord(half %x) #0 { ; CHECK-LABEL: @oge_eq_inf_and_ord( -; CHECK-NEXT: [[AND:%.*]] = fcmp oeq half [[X:%.*]], 0xH7C00 -; CHECK-NEXT: ret i1 [[AND]] +; CHECK-NEXT: [[OGE_FABS_INF:%.*]] = fcmp oeq half [[X:%.*]], 0xH7C00 +; CHECK-NEXT: ret i1 [[OGE_FABS_INF]] ; %oge.fabs.inf = fcmp oge half %x, 0xH7C00 %ord = fcmp ord half %x, 0xH0000 @@ -1379,8 +1379,8 @@ define i1 @ult_fabs_eq_inf_or_uno(half %x) #0 { define i1 @ult_eq_inf_or_uno(half %x) #0 { ; CHECK-LABEL: @ult_eq_inf_or_uno( -; CHECK-NEXT: [[OR:%.*]] = fcmp une half [[X:%.*]], 0xH7C00 -; CHECK-NEXT: ret i1 [[OR]] +; CHECK-NEXT: [[ULT_FABS_INF:%.*]] = fcmp une half [[X:%.*]], 0xH7C00 +; CHECK-NEXT: ret i1 [[ULT_FABS_INF]] ; %ult.fabs.inf = fcmp ult half %x, 0xH7C00 %uno = fcmp uno half %x, 0xH0000 @@ -1465,8 +1465,8 @@ define i1 @oeq_neginfinity_or_ord(half %x) #0 { ; -> ninf define i1 @oeq_neginfinity_and_ord(half %x) #0 { ; CHECK-LABEL: @oeq_neginfinity_and_ord( -; CHECK-NEXT: [[CLASS:%.*]] = fcmp oeq half [[X:%.*]], 0xHFC00 -; CHECK-NEXT: ret i1 [[CLASS]] +; CHECK-NEXT: [[OEQ_NEG_INFINITY:%.*]] = fcmp oeq half [[X:%.*]], 0xHFC00 +; CHECK-NEXT: ret i1 [[OEQ_NEG_INFINITY]] ; %oeq.neg.infinity = fcmp oeq half %x, 0xHFC00 %ord = fcmp ord half %x, 0.0 @@ -1597,8 +1597,8 @@ define i1 @ueq_neginfinity_and_olt_smallest_normal(half %x) #0 { ; -> nan|ninf define i1 @ueq_neginfinity_or_uno(half %x) #0 { ; CHECK-LABEL: @ueq_neginfinity_or_uno( -; CHECK-NEXT: [[CLASS:%.*]] = fcmp ueq half [[X:%.*]], 0xHFC00 -; CHECK-NEXT: ret i1 [[CLASS]] +; CHECK-NEXT: [[UEQ_NEG_INFINITY:%.*]] = fcmp ueq half [[X:%.*]], 0xHFC00 +; CHECK-NEXT: ret i1 [[UEQ_NEG_INFINITY]] ; %ueq.neg.infinity = fcmp ueq half %x, 0xHFC00 %uno = fcmp uno half %x, 0.0 diff --git a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll index d898df0..4b2ff1b 
100644 --- a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll +++ b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll @@ -259,3 +259,170 @@ define <2 x i1> @uno8(<2 x double> %x, <2 x double> %y) { %r = or <2 x i1> %cmp1, %cmp2 ret <2 x i1> %r } + +define i1 @olt_implies_ord(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %x, %y + %ret = and i1 %olt, %ord + ret i1 %ret +} + +define i1 @olt_implies_ord_commuted1(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord_commuted1( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %y, %x + %ret = and i1 %olt, %ord + ret i1 %ret +} + +define i1 @olt_implies_ord_commuted2(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord_commuted2( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %x, %y + %ret = and i1 %ord, %olt + ret i1 %ret +} + +define i1 @olt_implies_ord_commuted3(float %x, float %y) { +; CHECK-LABEL: @olt_implies_ord_commuted3( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i1 [[OLT]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %olt = fcmp olt float %y, %x + %ret = and i1 %ord, %olt + ret i1 %ret +} + +define <2 x i1> @olt_implies_ord_vec(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @olt_implies_ord_vec( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt <2 x float> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret <2 x i1> [[OLT]] +; + %ord = fcmp ord <2 x float> %x, zeroinitializer + %olt = fcmp olt <2 x float> %x, %y + %ret = and <2 x i1> %ord, %olt + ret <2 x i1> %ret +} + +define i1 @ord_implies_ord(float %x, float %y) { +; CHECK-LABEL: @ord_implies_ord( +; CHECK-NEXT: [[ORD2:%.*]] = fcmp ord float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[ORD2]] +; + %ord = fcmp ord float %x, 0.000000e+00 + %ord2 = fcmp ord float %x, %y + %ret = and i1 %ord, %ord2 + ret i1 %ret +} + +define i1 @olt_implies_uno(float %x, float %y) { +; CHECK-LABEL: @olt_implies_uno( +; CHECK-NEXT: ret i1 false +; + %uno = fcmp uno float %x, 0.000000e+00 + %olt = fcmp olt float %x, %y + %ret = and i1 %olt, %uno + ret i1 %ret +} + +define i1 @ult_implies_uno(float %x, float %y) { +; CHECK-LABEL: @ult_implies_uno( +; CHECK-NEXT: [[ULT:%.*]] = fcmp ult float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[ULT]] +; + %uno = fcmp uno float %x, 0.000000e+00 + %ult = fcmp ult float %x, %y + %ret = or i1 %ult, %uno + ret i1 %ret +} + +define i1 @uno_implies_uno(float %x, float %y) { +; CHECK-LABEL: @uno_implies_uno( +; CHECK-NEXT: [[UNO2:%.*]] = fcmp uno float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[UNO2]] +; + %uno = fcmp uno float %x, 0.000000e+00 + %uno2 = fcmp uno float %x, %y + %ret = or i1 %uno, %uno2 + ret i1 %ret +} + +define i1 @ult_implies_ord(float %x, float %y) { +; CHECK-LABEL: @ult_implies_ord( +; CHECK-NEXT: ret i1 true +; + %ord = fcmp ord float %x, 0.000000e+00 + %ult = fcmp ult float %x, %y + %ret = or i1 %ult, %ord + ret i1 %ret +} + +; TODO: %cmp1 is false implies %cmp3 is true +define float @test_ord_implies_uno(float %x) { +; CHECK-LABEL: @test_ord_implies_uno( +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ord float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], 0.000000e+00 +; CHECK-NEXT: [[CMP3:%.*]] = fcmp uno 
float [[X]], 0.000000e+00 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 [[CMP3]] +; CHECK-NEXT: [[RET:%.*]] = select i1 [[SEL]], float 0.000000e+00, float [[X]] +; CHECK-NEXT: ret float [[RET]] +; + %cmp1 = fcmp ord float %x, 0.000000e+00 + %cmp2 = fcmp olt float %x, 0.000000e+00 + %cmp3 = fcmp uno float %x, 0.000000e+00 + %sel = select i1 %cmp1, i1 %cmp2, i1 %cmp3 + %ret = select i1 %sel, float 0.000000e+00, float %x + ret float %ret +} + +; Negative tests + +define i1 @olt_implies_ord_fail(float %x, float %y, float %z) { +; CHECK-LABEL: @olt_implies_ord_fail( +; CHECK-NEXT: [[ORD:%.*]] = fcmp ord float [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = and i1 [[OLT]], [[ORD]] +; CHECK-NEXT: ret i1 [[RET]] +; + %ord = fcmp ord float %x, %z + %olt = fcmp olt float %x, %y + %ret = and i1 %olt, %ord + ret i1 %ret +} + +define i1 @ult_implies_uno_and(float %x, float %y) { +; CHECK-LABEL: @ult_implies_uno_and( +; CHECK-NEXT: [[UNO:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: [[ULT:%.*]] = fcmp ult float [[X]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = and i1 [[ULT]], [[UNO]] +; CHECK-NEXT: ret i1 [[RET]] +; + %uno = fcmp uno float %x, 0.000000e+00 + %ult = fcmp ult float %x, %y + %ret = and i1 %ult, %uno + ret i1 %ret +} + +define i1 @olt_implies_olt_fail(float %x, float %y) { +; CHECK-LABEL: @olt_implies_olt_fail( +; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: [[OLT2:%.*]] = fcmp olt float [[X]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = and i1 [[OLT]], [[OLT2]] +; CHECK-NEXT: ret i1 [[RET]] +; + %olt = fcmp olt float %x, 0.000000e+00 + %olt2 = fcmp olt float %x, %y + %ret = and i1 %olt, %olt2 + ret i1 %ret +} -- cgit v1.1 From 9ff3b82948c90c54f2f6ec20798c529cb93fab3b Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 8 Feb 2024 08:30:59 +0100 Subject: [AMDGPU] Revert Metadata Version Upgrade (#80995) Metadata is still 1.2, not 1.3 after V6. I thought that amdhsa.version mapped to the COV version but it's separate, and there are no MD changes in V6, hence it doesn't need to be updated. --- llvm/include/llvm/Support/AMDGPUMetadata.h | 2 +- llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll | 7 +++---- llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll | 11 +++++------ .../CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll | 9 ++++----- llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll | 9 ++++----- llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll | 11 +++++------ llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll | 11 +++++------ 13 files changed, 57 insertions(+), 69 deletions(-) diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h index d5e0f40..76ac7ab 100644 --- a/llvm/include/llvm/Support/AMDGPUMetadata.h +++ b/llvm/include/llvm/Support/AMDGPUMetadata.h @@ -47,7 +47,7 @@ constexpr uint32_t VersionMinorV5 = 2; /// HSA metadata major version for code object V6. constexpr uint32_t VersionMajorV6 = 1; /// HSA metadata minor version for code object V6. 
-constexpr uint32_t VersionMinorV6 = 3; +constexpr uint32_t VersionMinorV6 = 2; /// Old HSA metadata beginning assembler directive for V2. This is only used for /// diagnostics now. diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index 7404015..bc8f3eb 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -1,8 +1,8 @@ ; REQUIRES: asserts ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV4 %s ; RUN: not llc --crash -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5,COV56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6,COV56 %s ; AMDGPUAttributor deletes the function "by accident" so it's never ; codegened with optimizations. @@ -20,8 +20,7 @@ ; OPT-NEXT: amdhsa.version: ; OPT-NEXT: - 1 ; COV4: - 1 -; COV5: - 2 -; COV6: - 3 +; COV56: - 2 ; OPT: ... define internal i32 @func() { ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll index 4faaf60..89d89a7 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx900 ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll index 2079db7..bc57c99 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll index 5fa49c5..51351c3 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 +; ASM: - 1 ; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll index 0d0a8d8..f408cbe 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 
's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 +; ASM: - 1 ; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll index c29fb1f..78b3376 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll index 8f6a4ff..d1c98c7 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll index f24e0b2..adf84db 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll index 1493004..210b2e8 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 
's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll index f0af6ca..44e77a2 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll index 5501ce9..3205dbe 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 
's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll index 4cec639..6e7c575 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s @@ -13,10 +13,9 @@ ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: -; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 -; ASM6: - 3 +; ASM: - 1 +; ASM4: - 1 +; ASM56: - 2 ; ELF: OS/ABI: AMDGPU_HSA (0x40) ; ELF4: ABIVersion: 2 -- cgit v1.1 From a446c9bf69b4797da329977366ca62e55a429a90 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Thu, 8 Feb 2024 08:38:35 +0100 Subject: [clang][dataflow] Add support for `CXXRewrittenBinaryOperator`. (#81086) This occurs in rewritten candidates for binary operators (a C++20 feature). The patch modifies UncheckedOptionalAccessModelTest to run in C++20 mode (as well as C++17 mode, as before) and to use rewritten candidates. The modified test fails without the newly added support for `CXXRewrittenBinaryOperator`. 
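As a hedged illustration of the AST shape involved (this snippet is not part
of the patch, and the names are invented): under `-std=c++20`, the `!=` below
is resolved through a rewritten candidate, so the expression is modeled as a
`CXXRewrittenBinaryOperator` whose semantic form is `!(L == R)`:

```cpp
struct Box {
  int Value;
  // Only operator== is declared; C++20 rewrites != in terms of it.
  friend bool operator==(const Box &L, const Box &R) {
    return L.Value == R.Value;
  }
};

bool boxesDiffer(const Box &L, const Box &R) {
  // In C++20 this call site is a CXXRewrittenBinaryOperator; the new
  // visitor propagates the value of its semantic form, !(L == R), to
  // the whole expression.
  return L != R;
}
```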
---
 clang/lib/Analysis/FlowSensitive/Transfer.cpp          |  4 ++++
 .../FlowSensitive/UncheckedOptionalAccessModelTest.cpp | 16 +++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index bb3aec7..a098471 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -545,6 +545,10 @@ public:
     VisitCallExpr(S);
   }
 
+  void VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *RBO) {
+    propagateValue(*RBO->getSemanticForm(), *RBO, Env);
+  }
+
   void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *S) {
     if (S->getCastKind() == CK_ConstructorConversion) {
       const Expr *SubExpr = S->getSubExpr();
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index 73fb406..b6e4973 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -770,12 +770,17 @@ constexpr bool operator!=(const optional<T> &lhs, const optional<T> &rhs);
 
 template <typename T>
 constexpr bool operator==(const optional<T> &opt, nullopt_t);
+
+// C++20 and later do not define the following overloads because they are
+// provided by rewritten candidates instead.
+#if __cplusplus < 202002L
 template <typename T>
 constexpr bool operator==(nullopt_t, const optional<T> &opt);
 template <typename T>
 constexpr bool operator!=(const optional<T> &opt, nullopt_t);
 template <typename T>
 constexpr bool operator!=(nullopt_t, const optional<T> &opt);
+#endif // __cplusplus < 202002L
 
 template <typename T, typename U>
 constexpr bool operator==(const optional<T> &opt, const U &value);
@@ -1289,6 +1294,15 @@ protected:
   template <typename FuncDeclMatcher>
   void ExpectDiagnosticsFor(std::string SourceCode,
                             FuncDeclMatcher FuncMatcher) {
+    // Run in C++17 and C++20 mode to cover differences in the AST between modes
+    // (e.g. C++20 can contain `CXXRewrittenBinaryOperator`).
+ for (const char *CxxMode : {"-std=c++17", "-std=c++20"}) + ExpectDiagnosticsFor(SourceCode, FuncMatcher, CxxMode); + } + + template + void ExpectDiagnosticsFor(std::string SourceCode, FuncDeclMatcher FuncMatcher, + const char *CxxMode) { ReplaceAllOccurrences(SourceCode, "$ns", GetParam().NamespaceName); ReplaceAllOccurrences(SourceCode, "$optional", GetParam().TypeName); @@ -1332,7 +1346,7 @@ protected: llvm::move(EltDiagnostics, std::back_inserter(Diagnostics)); }) .withASTBuildArgs( - {"-fsyntax-only", "-std=c++17", "-Wno-undefined-inline"}) + {"-fsyntax-only", CxxMode, "-Wno-undefined-inline"}) .withASTBuildVirtualMappedFiles( tooling::FileContentMappings(Headers.begin(), Headers.end())), /*VerifyResults=*/[&Diagnostics]( -- cgit v1.1 From a24b0c351a75a87410203dd3777c0d8ee87f65c1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Feb 2024 13:20:34 +0530 Subject: clang/AMDGPU: Regenerate test checks in hip header tests --- clang/test/Headers/__clang_hip_cmath.hip | 16 +- clang/test/Headers/__clang_hip_math.hip | 896 +++++++++++++++---------------- 2 files changed, 456 insertions(+), 456 deletions(-) diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip index c194f44..cd085fd 100644 --- a/clang/test/Headers/__clang_hip_cmath.hip +++ b/clang/test/Headers/__clang_hip_cmath.hip @@ -61,13 +61,13 @@ extern "C" __device__ float test_fabs_f32(float x) { // DEFAULT-LABEL: @test_sin_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]] -// DEFAULT-NEXT: ret float [[CALL_I_I]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]] +// DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_sin_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]] -// FINITEONLY-NEXT: ret float [[CALL_I_I]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]] +// FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_sin_f32(float x) { return sin(x); @@ -75,13 +75,13 @@ extern "C" __device__ float test_sin_f32(float x) { // DEFAULT-LABEL: @test_cos_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]] -// DEFAULT-NEXT: ret float [[CALL_I_I]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]] +// DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_cos_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]] -// FINITEONLY-NEXT: ret float [[CALL_I_I]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]] +// FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_cos_f32(float x) { return cos(x); diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 5230c36..e9a9cb4 100644 --- 
a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -258,17 +258,17 @@ extern "C" __device__ long long test_llabs(long x) { // DEFAULT-LABEL: @test_acosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_acosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_acosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_acosf(float x) { @@ -277,17 +277,17 @@ extern "C" __device__ float test_acosf(float x) { // DEFAULT-LABEL: @test_acos( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_acos( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_acos( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_acos(double x) { @@ -296,17 +296,17 @@ extern "C" __device__ double test_acos(double x) { // DEFAULT-LABEL: @test_acoshf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_acoshf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // 
APPROX-LABEL: @test_acoshf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_acoshf(float x) { @@ -315,17 +315,17 @@ extern "C" __device__ float test_acoshf(float x) { // DEFAULT-LABEL: @test_acosh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_acosh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_acosh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_acosh(double x) { @@ -334,17 +334,17 @@ extern "C" __device__ double test_acosh(double x) { // DEFAULT-LABEL: @test_asinf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_asinf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_asinf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_asinf(float x) { @@ -353,17 +353,17 @@ extern "C" __device__ float test_asinf(float x) { // DEFAULT-LABEL: @test_asin( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_asin( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) 
[[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_asin( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_asin(double x) { @@ -373,17 +373,17 @@ extern "C" __device__ double test_asin(double x) { // DEFAULT-LABEL: @test_asinhf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_asinhf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_asinhf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_asinhf(float x) { @@ -392,17 +392,17 @@ extern "C" __device__ float test_asinhf(float x) { // DEFAULT-LABEL: @test_asinh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_asinh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_asinh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_asinh(double x) { @@ -411,17 +411,17 @@ extern "C" __device__ double test_asinh(double x) { // DEFAULT-LABEL: @test_atan2f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], 
float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_atan2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_atan2f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atan2f(float x, float y) { @@ -430,17 +430,17 @@ extern "C" __device__ float test_atan2f(float x, float y) { // DEFAULT-LABEL: @test_atan2( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_atan2( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_atan2( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atan2(double x, double y) { @@ -449,17 +449,17 @@ extern "C" __device__ double test_atan2(double x, double y) { // DEFAULT-LABEL: @test_atanf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_atanf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_atanf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float 
@__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atanf(float x) { @@ -468,17 +468,17 @@ extern "C" __device__ float test_atanf(float x) { // DEFAULT-LABEL: @test_atan( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_atan( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_atan( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atan(double x) { @@ -487,17 +487,17 @@ extern "C" __device__ double test_atan(double x) { // DEFAULT-LABEL: @test_atanhf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_atanhf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_atanhf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atanhf(float x) { @@ -506,17 +506,17 @@ extern "C" __device__ float test_atanhf(float x) { // DEFAULT-LABEL: @test_atanh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_atanh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double 
@__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_atanh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atanh(double x) { @@ -525,17 +525,17 @@ extern "C" __device__ double test_atanh(double x) { // DEFAULT-LABEL: @test_cbrtf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cbrtf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cbrtf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cbrtf(float x) { @@ -544,17 +544,17 @@ extern "C" __device__ float test_cbrtf(float x) { // DEFAULT-LABEL: @test_cbrt( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cbrt( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cbrt( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cbrt(double x) { @@ -639,17 +639,17 @@ extern "C" __device__ double test_copysign(double x, double y) { // DEFAULT-LABEL: @test_cosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: 
[[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]] +// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_cosf(float x) { @@ -658,17 +658,17 @@ extern "C" __device__ float test_cosf(float x) { // DEFAULT-LABEL: @test_cos( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cos( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cos( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cos(double x) { @@ -677,17 +677,17 @@ extern "C" __device__ double test_cos(double x) { // DEFAULT-LABEL: @test_coshf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_coshf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_coshf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_coshf(float x) { @@ -696,17 +696,17 @@ extern "C" __device__ float test_coshf(float x) { // DEFAULT-LABEL: @test_cosh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// 
DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cosh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cosh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cosh(double x) { @@ -715,17 +715,17 @@ extern "C" __device__ double test_cosh(double x) { // DEFAULT-LABEL: @test_cospif( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cospif( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cospif( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cospif(float x) { @@ -734,17 +734,17 @@ extern "C" __device__ float test_cospif(float x) { // DEFAULT-LABEL: @test_cospi( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cospi( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cospi( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cospi(double x) { @@ -753,17 
+753,17 @@ extern "C" __device__ double test_cospi(double x) { // DEFAULT-LABEL: @test_cyl_bessel_i0f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i0f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i0f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cyl_bessel_i0f(float x) { @@ -772,17 +772,17 @@ extern "C" __device__ float test_cyl_bessel_i0f(float x) { // DEFAULT-LABEL: @test_cyl_bessel_i0( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i0( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i0( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cyl_bessel_i0(double x) { @@ -791,17 +791,17 @@ extern "C" __device__ double test_cyl_bessel_i0(double x) { // DEFAULT-LABEL: @test_cyl_bessel_i1f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i1f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i1f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract 
noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cyl_bessel_i1f(float x) { @@ -810,17 +810,17 @@ extern "C" __device__ float test_cyl_bessel_i1f(float x) { // DEFAULT-LABEL: @test_cyl_bessel_i1( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_cyl_bessel_i1( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_cyl_bessel_i1( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cyl_bessel_i1(double x) { @@ -829,17 +829,17 @@ extern "C" __device__ double test_cyl_bessel_i1(double x) { // DEFAULT-LABEL: @test_erfcf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_erfcf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_erfcf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_erfcf(float x) { @@ -848,17 +848,17 @@ extern "C" __device__ float test_erfcf(float x) { // DEFAULT-LABEL: @test_erfc( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_erfc( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract 
noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_erfc( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_erfc(double x) { @@ -867,17 +867,17 @@ extern "C" __device__ double test_erfc(double x) { // DEFAULT-LABEL: @test_erfinvf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_erfinvf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_erfinvf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_erfinvf(float x) { @@ -886,17 +886,17 @@ extern "C" __device__ float test_erfinvf(float x) { // DEFAULT-LABEL: @test_erfinv( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_erfinv( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_erfinv( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_erfinv(double x) { @@ -905,17 +905,17 @@ extern "C" __device__ double test_erfinv(double x) { // DEFAULT-LABEL: @test_exp10f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_exp10f( // 
FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_exp10f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_exp10f(float x) { @@ -924,17 +924,17 @@ extern "C" __device__ float test_exp10f(float x) { // DEFAULT-LABEL: @test_exp10( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_exp10( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_exp10( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp10(double x) { @@ -962,17 +962,17 @@ extern "C" __device__ float test_exp2f(float x) { // DEFAULT-LABEL: @test_exp2( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_exp2( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_exp2( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp2(double x) { @@ -1000,17 +1000,17 @@ extern "C" __device__ float test_expf(float x) { // DEFAULT-LABEL: @test_exp( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_exp( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_exp( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp(double x) { @@ -1019,17 +1019,17 @@ extern "C" __device__ double test_exp(double x) { // DEFAULT-LABEL: @test_expm1f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_expm1f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_expm1f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_expm1f(float x) { @@ -1038,17 +1038,17 @@ extern "C" __device__ float test_expm1f(float x) { // DEFAULT-LABEL: @test_expm1( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_expm1( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_expm1( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" 
__device__ double test_expm1(double x) { @@ -1095,17 +1095,17 @@ extern "C" __device__ double test_fabs(double x) { // DEFAULT-LABEL: @test_fdimf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_fdimf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_fdimf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_fdimf(float x, float y) { @@ -1114,17 +1114,17 @@ extern "C" __device__ float test_fdimf(float x, float y) { // DEFAULT-LABEL: @test_fdim( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_fdim( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_fdim( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_fdim(double x, double y) { @@ -1323,17 +1323,17 @@ extern "C" __device__ double test_fmin(double x, double y) { // DEFAULT-LABEL: @test_fmodf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_fmodf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan 
ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_fmodf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_fmodf(float x, float y) { @@ -1342,17 +1342,17 @@ extern "C" __device__ float test_fmodf(float x, float y) { // DEFAULT-LABEL: @test_fmod( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_fmod( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_fmod( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_fmod(double x, double y) { @@ -1385,17 +1385,17 @@ extern "C" __device__ double test_frexp(double x, int* y) { // DEFAULT-LABEL: @test_hypotf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_hypotf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_hypotf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) 
#[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_hypotf(float x, float y) { @@ -1404,17 +1404,17 @@ extern "C" __device__ float test_hypotf(float x, float y) { // DEFAULT-LABEL: @test_hypot( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_hypot( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_hypot( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_hypot(double x, double y) { @@ -1423,17 +1423,17 @@ extern "C" __device__ double test_hypot(double x, double y) { // DEFAULT-LABEL: @test_ilogbf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret i32 [[CALL_I]] // // FINITEONLY-LABEL: @test_ilogbf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret i32 [[CALL_I]] // // APPROX-LABEL: @test_ilogbf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret i32 [[CALL_I]] // extern "C" __device__ int test_ilogbf(float x) { @@ -1442,17 +1442,17 @@ extern "C" __device__ int test_ilogbf(float x) { // DEFAULT-LABEL: @test_ilogb( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret i32 [[CALL_I]] // // FINITEONLY-LABEL: @test_ilogb( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 
@__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret i32 [[CALL_I]]
 //
 // APPROX-LABEL: @test_ilogb(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret i32 [[CALL_I]]
 //
 extern "C" __device__ int test_ilogb(double x) {
@@ -1589,17 +1589,17 @@ extern "C" __device__ BOOL_TYPE test___isnan(double x) {
 // DEFAULT-LABEL: @test_j0f(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j0f(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_j0f(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_j0f(float x) {
@@ -1608,17 +1608,17 @@ extern "C" __device__ float test_j0f(float x) {
 // DEFAULT-LABEL: @test_j0(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j0(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_j0(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_j0(double x) {
@@ -1627,17 +1627,17 @@ extern "C" __device__ double test_j0(double x) {
 // DEFAULT-LABEL: @test_j1f(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j1f(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_j1f(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_j1f(float x) {
@@ -1646,17 +1646,17 @@ extern "C" __device__ float test_j1f(float x) {
 // DEFAULT-LABEL: @test_j1(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j1(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_j1(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_j1(double x) {
@@ -1670,14 +1670,14 @@ extern "C" __device__ double test_j1(double x) {
 // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT: ]
 // DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
 // DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT]]
 // DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // DEFAULT: for.body.i:
@@ -1703,14 +1703,14 @@ extern "C" __device__ double test_j1(double x) {
 // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT: ]
 // FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
 // FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT]]
 // FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // FINITEONLY: for.body.i:
@@ -1736,14 +1736,14 @@ extern "C" __device__ double test_j1(double x) {
 // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT: ]
 // APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
 // APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL3JNFIF_EXIT]]
 // APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // APPROX: for.body.i:
@@ -1773,14 +1773,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT: ]
 // DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
 // DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: br label [[_ZL2JNID_EXIT]]
 // DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // DEFAULT: for.body.i:
@@ -1806,14 +1806,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT: ]
 // FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
 // FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT]]
 // FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // FINITEONLY: for.body.i:
@@ -1839,14 +1839,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT: ]
 // APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
 // APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: br label [[_ZL2JNID_EXIT]]
 // APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
 // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // APPROX: for.body.i:
@@ -1909,17 +1909,17 @@ extern "C" __device__ double test_ldexp(double x, int y) {
 // DEFAULT-LABEL: @test_lgammaf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_lgammaf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_lgammaf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_lgammaf(float x) {
@@ -1928,17 +1928,17 @@ extern "C" __device__ float test_lgammaf(float x) {
 // DEFAULT-LABEL: @test_lgamma(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_lgamma(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_lgamma(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_lgamma(double x) {
@@ -2054,17 +2054,17 @@ extern "C" __device__ float test_log10f(float x) {
 // DEFAULT-LABEL: @test_log10(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log10(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log10(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log10(double x) {
@@ -2073,17 +2073,17 @@ extern "C" __device__ double test_log10(double x) {
 // DEFAULT-LABEL: @test_log1pf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log1pf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_log1pf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_log1pf(float x) {
@@ -2092,17 +2092,17 @@ extern "C" __device__ float test_log1pf(float x) {
 // DEFAULT-LABEL: @test_log1p(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log1p(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log1p(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log1p(double x) {
@@ -2111,12 +2111,12 @@ extern "C" __device__ double test_log1p(double x) {
 // DEFAULT-LABEL: @test_log2f(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log2f(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_log2f(
@@ -2130,17 +2130,17 @@ extern "C" __device__ float test_log2f(float x) {
 // DEFAULT-LABEL: @test_log2(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log2(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log2(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log2(double x) {
@@ -2149,17 +2149,17 @@ extern "C" __device__ double test_log2(double x) {
 // DEFAULT-LABEL: @test_logbf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logbf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_logbf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_logbf(float x) {
@@ -2168,17 +2168,17 @@ extern "C" __device__ float test_logbf(float x) {
 // DEFAULT-LABEL: @test_logb(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logb(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_logb(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_logb(double x) {
@@ -2187,12 +2187,12 @@ extern "C" __device__ double test_logb(double x) {
 // DEFAULT-LABEL: @test_logf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_logf(
@@ -2295,31 +2295,31 @@ extern "C" __device__ long int test_lround(double x) {
 // DEFAULT-LABEL: @test_modff(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // DEFAULT-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_modff(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // FINITEONLY-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_modff(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // APPROX-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_modff(float x, float* y) {
@@ -2329,31 +2329,31 @@ extern "C" __device__ float test_modff(float x, float* y) {
 // DEFAULT-LABEL: @test_modf(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // DEFAULT-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_modf(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // FINITEONLY-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_modf(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // APPROX-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_modf(double x, double* y) {
@@ -2629,17 +2629,17 @@ extern "C" __device__ double test_nearbyint(double x) {
 // DEFAULT-LABEL: @test_nextafterf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_nextafterf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_nextafterf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_nextafterf(float x, float y) {
@@ -2648,17 +2648,17 @@ extern "C" __device__ float test_nextafterf(float x, float y) {
 // DEFAULT-LABEL: @test_nextafter(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_nextafter(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_nextafter(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_nextafter(double x, double y) {
@@ -2667,17 +2667,17 @@ extern "C" __device__ double test_nextafter(double x, double y) {
 // DEFAULT-LABEL: @test_norm3df(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm3df(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm3df(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_norm3df(float x, float y, float z) {
@@ -2686,17 +2686,17 @@ extern "C" __device__ float test_norm3df(float x, float y, float z) {
 // DEFAULT-LABEL: @test_norm3d(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm3d(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm3d(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_norm3d(double x, double y, double z) {
@@ -2705,17 +2705,17 @@ extern "C" __device__ double test_norm3d(double x, double y, double z) {
 // DEFAULT-LABEL: @test_norm4df(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm4df(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm4df(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
@@ -2724,17 +2724,17 @@ extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
 // DEFAULT-LABEL: @test_norm4d(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm4d(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm4d(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_norm4d(double x, double y, double z, double w) {
@@ -2743,17 +2743,17 @@ extern "C" __device__ double test_norm4d(double x, double y, double z, double w)
 // DEFAULT-LABEL: @test_normcdff(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdff(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdff(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_normcdff(float x) {
@@ -2762,17 +2762,17 @@ extern "C" __device__ float test_normcdff(float x) {
 // DEFAULT-LABEL: @test_normcdf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_normcdf(double x) {
@@ -2781,17 +2781,17 @@ extern "C" __device__ double test_normcdf(double x) {
 // DEFAULT-LABEL: @test_normcdfinvf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdfinvf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdfinvf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_normcdfinvf(float x) {
@@ -2800,17 +2800,17 @@ extern "C" __device__ float test_normcdfinvf(float x) {
 // DEFAULT-LABEL: @test_normcdfinv(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdfinv(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdfinv(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_normcdfinv(double x) {
@@ -2947,17 +2947,17 @@ extern "C" __device__ double test_norm(int x, const double *y) {
 // DEFAULT-LABEL: @test_powf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_powf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_powf(float x, float y) {
@@ -2966,17 +2966,17 @@ extern "C" __device__ float test_powf(float x, float y) {
 // DEFAULT-LABEL: @test_pow(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_pow(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_pow(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_pow(double x, double y) {
@@ -2985,17 +2985,17 @@ extern "C" __device__ double test_pow(double x, double y) {
 // DEFAULT-LABEL: @test_powif(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powif(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_powif(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_powif(float x, int y) {
@@ -3004,17 +3004,17 @@ extern "C" __device__ float test_powif(float x, int y) {
 // DEFAULT-LABEL: @test_powi(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powi(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_powi(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_powi(double x, int y) {
@@ -3023,17 +3023,17 @@ extern "C" __device__ double test_powi(double x, int y) {
 // DEFAULT-LABEL: @test_rcbrtf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rcbrtf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rcbrtf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rcbrtf(float x) {
@@ -3042,17 +3042,17 @@ extern "C" __device__ float test_rcbrtf(float x) {
 // DEFAULT-LABEL: @test_rcbrt(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rcbrt(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rcbrt(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rcbrt(double x) {
@@ -3061,17 +3061,17 @@ extern "C" __device__ double test_rcbrt(double x) {
 // DEFAULT-LABEL: @test_remainderf(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remainderf(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_remainderf(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_remainderf(float x, float y) {
@@ -3080,17 +3080,17 @@ extern "C" __device__ float test_remainderf(float x, float y) {
 // DEFAULT-LABEL: @test_remainder(
 // DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remainder(
 // FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_remainder(
 // APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
 // APPROX-NEXT: ret double [[CALL_I]]
 //
 extern "C" __device__ double test_remainder(double x, double y) {
@@ -3100,31 +3100,31 @@ extern "C" __device__ double test_remainder(double x, double y) {
 // DEFAULT-LABEL: @test_remquof(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remquof(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_remquof(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // APPROX-NEXT: ret float [[CALL_I]]
 //
 extern "C" __device__ float test_remquof(float x, float y, int* z) {
@@ -3134,31 +3134,31 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) {
 // DEFAULT-LABEL: @test_remquo(
 // DEFAULT-NEXT: entry:
 // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // DEFAULT-NEXT: ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remquo(
 // FINITEONLY-NEXT: entry:
 // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT: ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_remquo(
 // APPROX-NEXT: entry:
 // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
 // APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_remquo(double x, double y, int* z) { @@ -3167,17 +3167,17 @@ extern "C" __device__ double test_remquo(double x, double y, int* z) { // DEFAULT-LABEL: @test_rhypotf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rhypotf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rhypotf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rhypotf(float x, float y) { @@ -3186,17 +3186,17 @@ extern "C" __device__ float test_rhypotf(float x, float y) { // DEFAULT-LABEL: @test_rhypot( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rhypot( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rhypot( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rhypot(double x, double y) { @@ -3258,7 +3258,7 @@ extern "C" __device__ double test_rint(double x) { // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop 
[[LOOP22:![0-9]+]] // DEFAULT: _ZL6rnormfiPKf.exit: // DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rnormf( @@ -3278,7 +3278,7 @@ extern "C" __device__ double test_rint(double x) { // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // FINITEONLY: _ZL6rnormfiPKf.exit: // FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rnormf( @@ -3298,7 +3298,7 @@ extern "C" __device__ double test_rint(double x) { // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // APPROX: _ZL6rnormfiPKf.exit: // APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnormf(int x, const float* y) { @@ -3322,7 +3322,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // DEFAULT: _ZL5rnormiPKd.exit: // DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm( @@ -3342,7 +3342,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // FINITEONLY: _ZL5rnormiPKd.exit: // FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rnorm( @@ -3362,7 +3362,7 @@ extern "C" __device__ 
float test_rnormf(int x, const float* y) { // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // APPROX: _ZL5rnormiPKd.exit: // APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm(int x, const double* y) { @@ -3371,17 +3371,17 @@ extern "C" __device__ double test_rnorm(int x, const double* y) { // DEFAULT-LABEL: @test_rnorm3df( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm3df( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rnorm3df( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnorm3df(float x, float y, float z) { @@ -3390,17 +3390,17 @@ extern "C" __device__ float test_rnorm3df(float x, float y, float z) { // DEFAULT-LABEL: @test_rnorm3d( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm3d( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // 
APPROX-LABEL: @test_rnorm3d( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm3d(double x, double y, double z) { @@ -3409,17 +3409,17 @@ extern "C" __device__ double test_rnorm3d(double x, double y, double z) { // DEFAULT-LABEL: @test_rnorm4df( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm4df( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rnorm4df( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) { @@ -3428,17 +3428,17 @@ extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) { // DEFAULT-LABEL: @test_rnorm4d( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rnorm4d( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], 
double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rnorm4d( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w) { @@ -3485,17 +3485,17 @@ extern "C" __device__ double test_round(double x) { // DEFAULT-LABEL: @test_rsqrtf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_rsqrtf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_rsqrtf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rsqrtf(float x) { @@ -3504,17 +3504,17 @@ extern "C" __device__ float test_rsqrtf(float x) { // DEFAULT-LABEL: @test_rsqrt( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_rsqrt( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_rsqrt( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rsqrt(double x) { @@ -3530,7 +3530,7 @@ extern "C" __device__ double test_rsqrt(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) // DEFAULT-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]] // 
DEFAULT: cond.false.i: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR12]] // DEFAULT-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]] // DEFAULT: _ZL8scalblnffl.exit: // DEFAULT-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3545,7 +3545,7 @@ extern "C" __device__ double test_rsqrt(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]] // FINITEONLY: cond.false.i: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_scalb_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_scalb_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR12]] // FINITEONLY-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]] // FINITEONLY: _ZL8scalblnffl.exit: // FINITEONLY-NEXT: [[COND_I:%.*]] = phi nnan ninf contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3560,7 +3560,7 @@ extern "C" __device__ double test_rsqrt(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) // APPROX-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]] // APPROX: cond.false.i: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR12]] // APPROX-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]] // APPROX: _ZL8scalblnffl.exit: // APPROX-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3579,7 +3579,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) // DEFAULT-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]] // DEFAULT: cond.false.i: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR12]] // DEFAULT-NEXT: br label [[_ZL7SCALBLNDL_EXIT]] // DEFAULT: _ZL7scalblndl.exit: // DEFAULT-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3594,7 +3594,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]] // FINITEONLY: cond.false.i: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_scalb_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract 
nofpclass(nan inf) double @__ocml_scalb_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR12]] // FINITEONLY-NEXT: br label [[_ZL7SCALBLNDL_EXIT]] // FINITEONLY: _ZL7scalblndl.exit: // FINITEONLY-NEXT: [[COND_I:%.*]] = phi nnan ninf contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3609,7 +3609,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) // APPROX-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]] // APPROX: cond.false.i: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR12]] // APPROX-NEXT: br label [[_ZL7SCALBLNDL_EXIT]] // APPROX: _ZL7scalblndl.exit: // APPROX-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] @@ -3681,34 +3681,34 @@ extern "C" __device__ BOOL_TYPE test___signbit(double x) { // DEFAULT-LABEL: @test_sincosf( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincosf( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] 
// FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincosf( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincosf(float x, float *y, float *z) { @@ -3718,34 +3718,34 @@ extern "C" __device__ void test_sincosf(float x, float *y, float *z) { // DEFAULT-LABEL: @test_sincos( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincos( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// 
FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincos( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincos(double x, double *y, double *z) { @@ -3755,34 +3755,34 @@ extern "C" __device__ void test_sincos(double x, double *y, double *z) { // DEFAULT-LABEL: @test_sincospif( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincospif( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa 
[[TBAA16]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincospif( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincospif(float x, float *y, float *z) { @@ -3792,34 +3792,34 @@ extern "C" __device__ void test_sincospif(float x, float *y, float *z) { // DEFAULT-LABEL: @test_sincospi( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test_sincospi( // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr 
addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test_sincospi( // APPROX-NEXT: entry: // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) -// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] // APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] +// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // extern "C" __device__ void test_sincospi(double x, double *y, double *z) { @@ -3828,17 +3828,17 @@ extern "C" __device__ void test_sincospi(double x, double *y, double *z) { // DEFAULT-LABEL: @test_sinf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_sinf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_sinf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_sinf(float x) { @@ -3847,17 +3847,17 @@ extern "C" __device__ float test_sinf(float x) { // DEFAULT-LABEL: @test_sin( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_sin( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf 
contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_sin( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_sin(double x) { @@ -3866,17 +3866,17 @@ extern "C" __device__ double test_sin(double x) { // DEFAULT-LABEL: @test_sinpif( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_sinpif( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_sinpif( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_sinpif(float x) { @@ -3885,17 +3885,17 @@ extern "C" __device__ float test_sinpif(float x) { // DEFAULT-LABEL: @test_sinpi( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_sinpi( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_sinpi( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_sinpi(double x) { @@ -3942,17 +3942,17 @@ extern "C" __device__ double test_sqrt(double x) { // DEFAULT-LABEL: @test_tanf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_tanf( // FINITEONLY-NEXT: entry: 
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_tanf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tanf(float x) { @@ -3961,17 +3961,17 @@ extern "C" __device__ float test_tanf(float x) { // DEFAULT-LABEL: @test_tan( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_tan( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_tan( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tan(double x) { @@ -3980,17 +3980,17 @@ extern "C" __device__ double test_tan(double x) { // DEFAULT-LABEL: @test_tanhf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_tanhf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_tanhf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tanhf(float x) { @@ -3999,17 +3999,17 @@ extern "C" __device__ float test_tanhf(float x) { // DEFAULT-LABEL: @test_tanh( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_tanh( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_tanh( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tanh(double x) { @@ -4018,17 +4018,17 @@ extern "C" __device__ double test_tanh(double x) { // DEFAULT-LABEL: @test_tgammaf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_tgammaf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_tgammaf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tgammaf(float x) { @@ -4037,17 +4037,17 @@ extern "C" __device__ float test_tgammaf(float x) { // DEFAULT-LABEL: @test_tgamma( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_tgamma( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_tgamma( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tgamma(double x) { @@ 
-4094,17 +4094,17 @@ extern "C" __device__ double test_trunc(double x) { // DEFAULT-LABEL: @test_y0f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_y0f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_y0f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_y0f(float x) { @@ -4113,17 +4113,17 @@ extern "C" __device__ float test_y0f(float x) { // DEFAULT-LABEL: @test_y0( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_y0( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_y0( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_y0(double x) { @@ -4132,17 +4132,17 @@ extern "C" __device__ double test_y0(double x) { // DEFAULT-LABEL: @test_y1f( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test_y1f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test_y1f( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef 
float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_y1f(float x) { @@ -4151,17 +4151,17 @@ extern "C" __device__ float test_y1f(float x) { // DEFAULT-LABEL: @test_y1( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // // FINITEONLY-LABEL: @test_y1( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // // APPROX-LABEL: @test_y1( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_y1(double x) { @@ -4175,14 +4175,14 @@ extern "C" __device__ double test_y1(double x) { // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // DEFAULT-NEXT: ] // DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] // DEFAULT: if.then2.i: -// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT]] // DEFAULT: if.end4.i: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] // DEFAULT: for.body.i: @@ -4208,14 +4208,14 @@ extern "C" __device__ double test_y1(double x) { // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // FINITEONLY-NEXT: ] // FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] // FINITEONLY: if.then2.i: -// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float 
noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT]] // FINITEONLY: if.end4.i: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] // FINITEONLY: for.body.i: @@ -4241,14 +4241,14 @@ extern "C" __device__ double test_y1(double x) { // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // APPROX-NEXT: ] // APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] // APPROX: if.then2.i: -// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL3YNFIF_EXIT]] // APPROX: if.end4.i: -// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] // APPROX: for.body.i: @@ -4278,14 +4278,14 @@ extern "C" __device__ float test_ynf(int x, float y) { // DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // DEFAULT-NEXT: ] // DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL2YNID_EXIT:%.*]] // DEFAULT: if.then2.i: -// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: br label [[_ZL2YNID_EXIT]] // DEFAULT: if.end4.i: -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract 
noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] // DEFAULT: for.body.i: @@ -4311,14 +4311,14 @@ extern "C" __device__ float test_ynf(int x, float y) { // FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // FINITEONLY-NEXT: ] // FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT:%.*]] // FINITEONLY: if.then2.i: -// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT]] // FINITEONLY: if.end4.i: -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] // FINITEONLY: for.body.i: @@ -4344,14 +4344,14 @@ extern "C" __device__ float test_ynf(int x, float y) { // APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]] // APPROX-NEXT: ] // APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL2YNID_EXIT:%.*]] // APPROX: if.then2.i: -// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: br label [[_ZL2YNID_EXIT]] // APPROX: if.end4.i: -// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]] +// APPROX-NEXT: 
[[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 // APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] // APPROX: for.body.i: @@ -4376,17 +4376,17 @@ extern "C" __device__ double test_yn(int x, double y) { // DEFAULT-LABEL: @test___cosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___cosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___cosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___cosf(float x) { @@ -4553,17 +4553,17 @@ extern "C" __device__ float test___frsqrt_rn(float x) { // DEFAULT-LABEL: @test___fsqrt_rn( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___fsqrt_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___fsqrt_rn( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___fsqrt_rn(float x) { @@ -4648,17 +4648,17 @@ extern "C" __device__ float test___logf(float x) { // DEFAULT-LABEL: @test___powf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___powf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan 
inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___powf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___powf(float x, float y) { @@ -4695,25 +4695,25 @@ extern "C" __device__ float test___saturatef(float x) { // DEFAULT-LABEL: @test___sincosf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: ret void // // FINITEONLY-LABEL: @test___sincosf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: [[CALL1_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL1_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: ret void // // APPROX-LABEL: @test___sincosf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: ret void // @@ -4723,17 +4723,17 @@ extern "C" __device__ void test___sincosf(float x, float *y, float *z) { // DEFAULT-LABEL: @test___sinf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // // FINITEONLY-LABEL: @test___sinf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // // APPROX-LABEL: @test___sinf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___sinf(float x) { @@ -4742,24 +4742,24 @@ extern "C" __device__ float test___sinf(float x) { // DEFAULT-LABEL: @test___tanf( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] -// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]] // DEFAULT-NEXT: ret float [[MUL_I]] // // FINITEONLY-LABEL: @test___tanf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[CALL_I3_I]], [[TMP0]] // FINITEONLY-NEXT: ret float [[MUL_I]] // // APPROX-LABEL: @test___tanf( // APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] -// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] +// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float 
[[CALL_I_I]])
// APPROX-NEXT:    [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]]
// APPROX-NEXT:    ret float [[MUL_I]]
--
cgit v1.1


From decbd29f9e9be50756a083cd677f7fea22cd3c91 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 8 Feb 2024 14:12:39 +0530
Subject: Reapply "InstCombine: Introduce SimplifyDemandedUseFPClass"" (#74056)

This reverts commit ef388334ee5a3584255b9ef5b3fefdb244fa3fd7.

The referenced issue violates the spec for finite-only math only by
using a return value for a constant infinity. If the interpretation is
results and arguments cannot violate nofpclass, then any
std::numeric_limits<T>::infinity() result is invalid under
-ffinite-math-only. Without this interpretation the utility of
nofpclass is slashed.
---
 llvm/include/llvm/Analysis/ValueTracking.h         |   4 +
 .../Transforms/InstCombine/InstCombineInternal.h   |   9 +
 .../InstCombine/InstCombineSimplifyDemanded.cpp    | 136 ++++++++++++++
 .../InstCombine/InstructionCombining.cpp           |  27 ++-
 .../InstCombine/simplify-demanded-fpclass.ll       | 209 ++++++++-------------
 5 files changed, 251 insertions(+), 134 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index d9287ae..06f94f5 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -248,6 +248,10 @@ struct KnownFPClass {
   /// definitely set or false if the sign bit is definitely unset.
   std::optional<bool> SignBit;
 
+  bool operator==(KnownFPClass Other) const {
+    return KnownFPClasses == Other.KnownFPClasses && SignBit == Other.SignBit;
+  }
+
   /// Return true if it's known this can never be one of the mask entries.
   bool isKnownNever(FPClassTest Mask) const {
     return (KnownFPClasses & Mask) == fcNone;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 97459a8..7f6618f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -566,6 +566,15 @@ public:
                                     APInt &PoisonElts, unsigned Depth = 0,
                                     bool AllowMultipleUsers = false) override;
 
+  /// Attempts to replace V with a simpler value based on the demanded
+  /// floating-point classes
+  Value *SimplifyDemandedUseFPClass(Value *V, FPClassTest DemandedMask,
+                                    KnownFPClass &Known, unsigned Depth,
+                                    Instruction *CxtI);
+  bool SimplifyDemandedFPClass(Instruction *I, unsigned Op,
+                               FPClassTest DemandedMask, KnownFPClass &Known,
+                               unsigned Depth = 0);
+
   /// Canonicalize the position of binops relative to shufflevector.
   Instruction *foldVectorBinop(BinaryOperator &Inst);
   Instruction *foldVectorSelect(SelectInst &Sel);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 79873a9..be6ee9d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1877,3 +1877,139 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
 
   return MadeChange ? I : nullptr;
 }
+
+/// For floating-point classes that resolve to a single bit pattern, return that
+/// value.
+static Constant *getFPClassConstant(Type *Ty, FPClassTest Mask) {
+  switch (Mask) {
+  case fcPosZero:
+    return ConstantFP::getZero(Ty);
+  case fcNegZero:
+    return ConstantFP::getZero(Ty, true);
+  case fcPosInf:
+    return ConstantFP::getInfinity(Ty);
+  case fcNegInf:
+    return ConstantFP::getInfinity(Ty, true);
+  case fcNone:
+    return PoisonValue::get(Ty);
+  default:
+    return nullptr;
+  }
+}
+
+Value *InstCombinerImpl::SimplifyDemandedUseFPClass(
+    Value *V, const FPClassTest DemandedMask, KnownFPClass &Known,
+    unsigned Depth, Instruction *CxtI) {
+  assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
+  Type *VTy = V->getType();
+
+  assert(Known == KnownFPClass() && "expected uninitialized state");
+
+  if (DemandedMask == fcNone)
+    return isa<UndefValue>(V) ? nullptr : PoisonValue::get(VTy);
+
+  if (Depth == MaxAnalysisRecursionDepth)
+    return nullptr;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    // Handle constants and arguments
+    Known = computeKnownFPClass(V, fcAllFlags, CxtI, Depth + 1);
+    Value *FoldedToConst =
+        getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+    return FoldedToConst == V ? nullptr : FoldedToConst;
+  }
+
+  if (!I->hasOneUse())
+    return nullptr;
+
+  // TODO: Should account for nofpclass/FastMathFlags on current instruction
+  switch (I->getOpcode()) {
+  case Instruction::FNeg: {
+    if (SimplifyDemandedFPClass(I, 0, llvm::fneg(DemandedMask), Known,
+                                Depth + 1))
+      return I;
+    Known.fneg();
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(I);
+    switch (CI->getIntrinsicID()) {
+    case Intrinsic::fabs:
+      if (SimplifyDemandedFPClass(I, 0, llvm::inverse_fabs(DemandedMask), Known,
+                                  Depth + 1))
+        return I;
+      Known.fabs();
+      break;
+    case Intrinsic::arithmetic_fence:
+      if (SimplifyDemandedFPClass(I, 0, DemandedMask, Known, Depth + 1))
+        return I;
+      break;
+    case Intrinsic::copysign: {
+      // Flip on more potentially demanded classes
+      const FPClassTest DemandedMaskAnySign = llvm::unknown_sign(DemandedMask);
+      if (SimplifyDemandedFPClass(I, 0, DemandedMaskAnySign, Known, Depth + 1))
+        return I;
+
+      if ((DemandedMask & fcPositive) == fcNone) {
+        // Roundabout way of replacing with fneg(fabs)
+        I->setOperand(1, ConstantFP::get(VTy, -1.0));
+        return I;
+      }
+
+      if ((DemandedMask & fcNegative) == fcNone) {
+        // Roundabout way of replacing with fabs
+        I->setOperand(1, ConstantFP::getZero(VTy));
+        return I;
+      }
+
+      KnownFPClass KnownSign =
+          computeKnownFPClass(I->getOperand(1), fcAllFlags, CxtI, Depth + 1);
+      Known.copysign(KnownSign);
+      break;
+    }
+    default:
+      Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+      break;
+    }
+
+    break;
+  }
+  case Instruction::Select: {
+    KnownFPClass KnownLHS, KnownRHS;
+    if (SimplifyDemandedFPClass(I, 2, DemandedMask, KnownRHS, Depth + 1) ||
+        SimplifyDemandedFPClass(I, 1, DemandedMask, KnownLHS, Depth + 1))
+      return I;
+
+    if (KnownLHS.isKnownNever(DemandedMask))
+      return I->getOperand(2);
+    if (KnownRHS.isKnownNever(DemandedMask))
+      return I->getOperand(1);
+
+    // TODO: Recognize clamping patterns
+    Known = KnownLHS | KnownRHS;
+    break;
+  }
+  default:
+    Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+    break;
+  }
+
+  return getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+}
+
+bool InstCombinerImpl::SimplifyDemandedFPClass(Instruction *I, unsigned OpNo,
+                                               FPClassTest DemandedMask,
+                                               KnownFPClass &Known,
+                                               unsigned Depth) {
+  Use &U = I->getOperandUse(OpNo);
+  Value *NewVal =
+      SimplifyDemandedUseFPClass(U.get(), DemandedMask, Known, Depth, I);
+  if (!NewVal)
+    return false;
+  if (Instruction *OpInst = dyn_cast<Instruction>(U))
+    salvageDebugInfo(*OpInst);
+
+  replaceUse(U, NewVal);
+  return true;
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9e8bcbc..b1e2262 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -142,6 +142,12 @@
 static cl::opt<unsigned> MaxArraySize(
     "instcombine-maxarray-size", cl::init(1024),
     cl::desc("Maximum array size considered when doing a combine"));
+// TODO: Remove this option
+static cl::opt<bool> EnableSimplifyDemandedUseFPClass(
+    "instcombine-simplify-demanded-fp-class",
+    cl::desc("Enable demanded floating-point class optimizations"),
+    cl::init(false));
+
 // FIXME: Remove this flag when it is no longer necessary to convert
 // llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
 // increases variable availability at the cost of accuracy. Variables that
@@ -3105,8 +3111,25 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI, Value *Op) {
 }
 
 Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) {
-  // Nothing for now.
-  return nullptr;
+  if (!EnableSimplifyDemandedUseFPClass)
+    return nullptr;
+
+  Value *RetVal = RI.getReturnValue();
+  if (!RetVal || !AttributeFuncs::isNoFPClassCompatibleType(RetVal->getType()))
+    return nullptr;
+
+  Function *F = RI.getFunction();
+  FPClassTest ReturnClass = F->getAttributes().getRetNoFPClass();
+  if (ReturnClass == fcNone)
+    return nullptr;
+
+  KnownFPClass KnownClass;
+  Value *Simplified =
+      SimplifyDemandedUseFPClass(RetVal, ~ReturnClass, KnownClass, 0, &RI);
+  if (!Simplified)
+    return nullptr;
+
+  return ReturnInst::Create(RI.getContext(), Simplified);
 }
 
 // WARNING: keep in sync with SimplifyCFGOpt::simplifyUnreachable()!
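As a minimal sketch of the new fold (distilled from the regression tests that follow; the function name @example is illustrative, not part of the patch): with the experimental -instcombine-simplify-demanded-fp-class flag on, visitReturnInst demands every floating-point class except those the function's nofpclass return attribute rules out, so a select arm that can only produce an excluded class is dead and drops away:

; Input: the nofpclass(inf) return attribute promises the +inf arm can
; never be the returned value, so only %x is demanded.
define nofpclass(inf) float @example(i1 %cond, float %x) {
  %select = select i1 %cond, float 0x7FF0000000000000, float %x
  ret float %select
}

; After opt -passes=instcombine -instcombine-simplify-demanded-fp-class,
; the body reduces to:
;   ret float %x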
diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll index 9817b6e..dd9b714 100644 --- a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll +++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -passes=instcombine < %s | FileCheck %s +; RUN: opt -S -passes=instcombine -instcombine-simplify-demanded-fp-class < %s | FileCheck %s declare float @llvm.fabs.f32(float) declare float @llvm.copysign.f32(float, float) @@ -42,7 +42,7 @@ define nofpclass(inf) float @ret_nofpclass_inf_undef() { define nofpclass(all) float @ret_nofpclass_all_var(float %arg) { ; CHECK-LABEL: define nofpclass(all) float @ret_nofpclass_all_var ; CHECK-SAME: (float [[ARG:%.*]]) { -; CHECK-NEXT: ret float [[ARG]] +; CHECK-NEXT: ret float poison ; ret float %arg } @@ -51,7 +51,7 @@ define nofpclass(all) float @ret_nofpclass_all_var(float %arg) { define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector(<2 x float> %arg) { ; CHECK-LABEL: define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector ; CHECK-SAME: (<2 x float> [[ARG:%.*]]) { -; CHECK-NEXT: ret <2 x float> [[ARG]] +; CHECK-NEXT: ret <2 x float> poison ; ret <2 x float> %arg } @@ -65,14 +65,14 @@ define nofpclass(inf) float @ret_nofpclass_inf__0() { define nofpclass(inf) float @ret_nofpclass_inf__pinf() { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__pinf() { -; CHECK-NEXT: ret float 0x7FF0000000000000 +; CHECK-NEXT: ret float poison ; ret float 0x7FF0000000000000 } define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() { ; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() { -; CHECK-NEXT: ret float 0x7FF0000000000000 +; CHECK-NEXT: ret float poison ; ret float 0x7FF0000000000000 } @@ -86,7 +86,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__ninf() { define nofpclass(inf) float @ret_nofpclass_inf__ninf() { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__ninf() { -; CHECK-NEXT: ret float 0xFFF0000000000000 +; CHECK-NEXT: ret float poison ; ret float 0xFFF0000000000000 } @@ -106,8 +106,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_inf_lhs(i1 %con define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs(i1 %cond, float nofpclass(nan norm zero sub) %x, float %y) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs ; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(nan zero sub norm) [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[Y]] ; %select = select i1 %cond, float %x, float %y ret float %select @@ -117,8 +116,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lh define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs(i1 %cond, float %x, float nofpclass(nan norm zero sub) %y) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float nofpclass(nan zero sub norm) [[Y:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %select = select i1 %cond, float %x, float %y ret float %select @@ -128,8 +126,7 @@ 
define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rh define nofpclass(inf) [3 x [2 x float]] @ret_float_array(i1 %cond, [3 x [2 x float]] nofpclass(nan norm zero sub) %x, [3 x [2 x float]] %y) { ; CHECK-LABEL: define nofpclass(inf) [3 x [2 x float]] @ret_float_array ; CHECK-SAME: (i1 [[COND:%.*]], [3 x [2 x float]] nofpclass(nan zero sub norm) [[X:%.*]], [3 x [2 x float]] [[Y:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], [3 x [2 x float]] [[X]], [3 x [2 x float]] [[Y]] -; CHECK-NEXT: ret [3 x [2 x float]] [[SELECT]] +; CHECK-NEXT: ret [3 x [2 x float]] [[Y]] ; %select = select i1 %cond, [3 x [2 x float]] %x, [3 x [2 x float]] %y ret [3 x [2 x float ]] %select @@ -139,8 +136,7 @@ define nofpclass(inf) [3 x [2 x float]] @ret_float_array(i1 %cond, [3 x [2 x flo define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %select = select i1 %cond, float 0x7FF0000000000000, float %x ret float %select @@ -150,8 +146,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs(i1 %cond, float define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 ret float %select @@ -161,8 +156,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs(i1 %cond, float define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float 0xFFF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float 0x7FF0000000000000, float 0xFFF0000000000000 ret float %select @@ -172,8 +166,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf(i1 %cond, fl define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000 ret float %select @@ -183,8 +176,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf(i1 %cond, fl define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000 -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float 0x7FF0000000000000 ; %select = select i1 %cond, 
float 0xFFF0000000000000, float 0x7FF0000000000000
   ret float %select
@@ -194,8 +186,7 @@ define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf(i1 %cond,
 define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float 0xFFF0000000000000
 ;
   %select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
   ret float %select
@@ -205,8 +196,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf(i1 %cond,
 define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float poison
 ;
   %select = select i1 %cond, float 0.0, float -0.0
   ret float %select
@@ -216,8 +206,7 @@ define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero(i1 %cond
 define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %select = select i1 %cond, float 0.0, float -0.0
   ret float %select
@@ -227,8 +216,7 @@ define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero(i1 %co
 define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT:    ret float [[SELECT]]
+; CHECK-NEXT:    ret float -0.000000e+00
 ;
   %select = select i1 %cond, float 0.0, float -0.0
   ret float %select
@@ -238,8 +226,7 @@ define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero(i1 %co
 define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
 ; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector
 ; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>, <2 x float> [[X]]
-; CHECK-NEXT:    ret <2 x float> [[SELECT]]
+; CHECK-NEXT:    ret <2 x float> [[X]]
 ;
   %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>, <2 x float> %x
   ret <2 x float> %select
@@ -249,8 +236,7 @@ define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector(<2
 define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef(<2 x i1> %cond, <2 x float> %x) {
 ; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef
 ; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float undef>, <2 x float> [[X]]
-; CHECK-NEXT:    ret <2 x float> [[SELECT]]
+; CHECK-NEXT:    ret <2 x float> [[X]]
 ;
   %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float undef>, <2 x float> %x
   ret <2 x float> %select
@@ -260,8 +246,7 @@ define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_und
 define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
 ; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector
 ; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float 0xFFF0000000000000>, <2 x float> [[X]]
-; CHECK-NEXT:    ret <2 x float> [[SELECT]]
+; CHECK-NEXT:    ret <2 x float> [[X]]
 ;
   %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0xFFF0000000000000>, <2 x float> %x
   ret <2 x float> %select
@@ -327,8 +312,7 @@ define nofpclass(nan) float @ret_nofpclass_nan__select_pinf_rhs(i1 %cond, float
 define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float [[X]]
 ;
   %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
   %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -338,8 +322,7 @@ define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0(i
 define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float 0x7FF8000000000000
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float poison
 ;
   %select0 = select i1 %cond, float %x, float 0x7FF8000000000000
   %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -360,8 +343,7 @@ define nofpclass(nan) float @ret_nofpclass_nan__select_chain_inf_nan(i1 %cond, f
 define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float [[X]]
 ;
   %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
   %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -371,8 +353,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0(i1 %cond,
 define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF8000000000000, float 0x7FF0000000000000
-; CHECK-NEXT:    ret float [[SELECT1]]
+; CHECK-NEXT:    ret float 0x7FF8000000000000
 ;
   %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
   %select1 = select i1 %cond, float %select0, float 0x7FF0000000000000
@@ -383,8 +364,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond,
 define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float
@ret_nofpclass_inf__fabs_select_ninf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0xFFF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[FABS]] ; %select = select i1 %cond, float %x, float 0xFFF0000000000000 @@ -396,8 +376,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, f define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[FABS]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -421,8 +400,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__fabs_ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[FABS]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -446,9 +424,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_na define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) -; CHECK-NEXT: ret float [[FABS]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 %fabs = call float @llvm.fabs.f32(float %select) @@ -459,8 +435,7 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_na define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0xFFF0000000000000 -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]] +; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]] ; CHECK-NEXT: ret float [[FNEG]] ; %select = select i1 %cond, float %x, float 0xFFF0000000000000 @@ -472,8 +447,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, f define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: 
[[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]] +; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]] ; CHECK-NEXT: ret float [[FNEG]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -485,8 +459,7 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float [[X]] -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]] +; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]] ; CHECK-NEXT: ret float [[FNEG]] ; %select = select i1 %cond, float 0xFFF0000000000000, float %x @@ -510,8 +483,7 @@ define nofpclass(pzero psub pnorm pinf) float @ret_nofpclass_nopositives___fneg_ define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]] ; CHECK-NEXT: ret float [[FNEG]] ; @@ -525,8 +497,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %co define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]] ; CHECK-NEXT: ret float [[FNEG]] ; @@ -541,10 +512,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_f define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) -; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]] -; CHECK-NEXT: ret float [[FNEG]] +; CHECK-NEXT: ret float poison ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 %fabs = call float @llvm.fabs.f32(float %select) @@ -556,8 +524,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_non define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 
[[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -568,8 +535,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -580,8 +546,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rh define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; @@ -594,7 +559,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rh define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -605,7 +571,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysig define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign_nnan_flag(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign_nnan_flag ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call nnan float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg nnan float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -616,7 +583,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysig define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_nopositives_nonan_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_nopositives_nonan_copysign ; CHECK-SAME: (float 
[[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -627,7 +595,7 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_nopositives_non define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -638,7 +606,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysig define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign_nnan_flag(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign_nnan_flag ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -649,7 +617,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysig define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan_copysign(float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan_copysign ; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign) @@ -659,8 +627,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_non define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs(i1 %cond, float %x, float %sign) { ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -673,8 +640,7 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysi define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float 
@ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -686,8 +652,8 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_no_negatives_noinf__ define nofpclass(inf pnorm psub pzero) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) { ; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -700,7 +666,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__copys ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -713,7 +679,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__copys ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -726,7 +693,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_no ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives_nonan__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -739,7 +706,8 @@ define nofpclass(nan pinf pnorm psub 
pzero) float @ret_nofpclass_no_positives_no ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nonan__copysign_unknown_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]] ; CHECK-NEXT: ret float [[COPYSIGN]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 @@ -790,9 +758,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nan_negatives__ define nofpclass(nan ninf nnorm nsub zero) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero(float %x) { ; CHECK-LABEL: define nofpclass(nan ninf zero nsub nnorm) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero ; CHECK-SAME: (float [[X:%.*]]) { -; CHECK-NEXT: [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %is.gt.zero = fcmp ogt float %x, 0.0 %select = select i1 %is.gt.zero, float 0.0, float %x @@ -803,9 +769,7 @@ define nofpclass(nan ninf nnorm nsub zero) float @ret_nofpclass_nan_negatives_ze define nofpclass(ninf nnorm nsub zero) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero(float %x) { ; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero ; CHECK-SAME: (float [[X:%.*]]) { -; CHECK-NEXT: [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[X]] ; %is.gt.zero = fcmp ogt float %x, 0.0 %select = select i1 %is.gt.zero, float 0.0, float %x @@ -819,8 +783,7 @@ define nofpclass(inf) float @ret_nofpclass_noinfs__assumed_isinf__select_pinf_lh ; CHECK-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]]) ; CHECK-NEXT: [[X_IS_INF:%.*]] = fcmp oeq float [[FABS_X]], 0x7FF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 [[X_IS_INF]]) -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[Y]] ; %fabs.x = call float @llvm.fabs.f32(float %x) %x.is.inf = fcmp oeq float %fabs.x, 0x7FF0000000000000 @@ -838,18 +801,13 @@ define nofpclass(nan inf nzero nsub nnorm) float @powr_issue64870(float nofpclas ; CHECK-NEXT: [[I1:%.*]] = tail call float @llvm.log2.f32(float [[I]]) ; CHECK-NEXT: [[I2:%.*]] = fmul float [[I1]], [[Y]] ; CHECK-NEXT: [[I3:%.*]] = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[I2]]) -; CHECK-NEXT: [[I4:%.*]] = fcmp olt float [[Y]], 0.000000e+00 -; CHECK-NEXT: [[I5:%.*]] = select i1 [[I4]], float 0x7FF0000000000000, float 0.000000e+00 ; CHECK-NEXT: [[I6:%.*]] = fcmp oeq float [[X]], 0.000000e+00 -; CHECK-NEXT: [[I7:%.*]] = select i1 [[I6]], float [[I5]], float [[I3]] +; CHECK-NEXT: [[I7:%.*]] = select i1 [[I6]], float 0.000000e+00, float [[I3]] ; CHECK-NEXT: [[I8:%.*]] = fcmp oeq float [[Y]], 0.000000e+00 -; CHECK-NEXT: [[I9:%.*]] = select i1 [[I6]], float 0x7FF8000000000000, float 1.000000e+00 -; CHECK-NEXT: [[I10:%.*]] = select i1 [[I8]], float [[I9]], float [[I7]] ; CHECK-NEXT: 
[[I11:%.*]] = fcmp oeq float [[X]], 1.000000e+00 -; CHECK-NEXT: [[I12:%.*]] = select i1 [[I11]], float 1.000000e+00, float [[I10]] -; CHECK-NEXT: [[I13:%.*]] = fcmp olt float [[X]], 0.000000e+00 -; CHECK-NEXT: [[I14:%.*]] = select i1 [[I13]], float 0x7FF8000000000000, float [[I12]] -; CHECK-NEXT: ret float [[I14]] +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[I11]], i1 true, i1 [[I8]] +; CHECK-NEXT: [[I12:%.*]] = select i1 [[TMP0]], float 1.000000e+00, float [[I7]] +; CHECK-NEXT: ret float [[I12]] ; entry: %i = tail call float @llvm.fabs.f32(float %x) @@ -881,12 +839,8 @@ define nofpclass(nan inf nzero nsub nnorm) float @test_powr_issue64870_2(float n ; CHECK-NEXT: [[I4:%.*]] = select i1 [[I]], float 0x7FF8000000000000, float [[ARG1]] ; CHECK-NEXT: [[I5:%.*]] = fmul float [[I4]], [[I3]] ; CHECK-NEXT: [[I6:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float noundef [[I5]]) -; CHECK-NEXT: [[I7:%.*]] = fcmp olt float [[I4]], 0.000000e+00 -; CHECK-NEXT: [[I8:%.*]] = select i1 [[I7]], float 0x7FF0000000000000, float 0.000000e+00 -; CHECK-NEXT: [[I9:%.*]] = fcmp ueq float [[I4]], 0.000000e+00 ; CHECK-NEXT: [[I10:%.*]] = fcmp oeq float [[I2]], 0.000000e+00 -; CHECK-NEXT: [[I11:%.*]] = select i1 [[I9]], float 0x7FF8000000000000, float [[I8]] -; CHECK-NEXT: [[I12:%.*]] = select i1 [[I10]], float [[I11]], float [[I6]] +; CHECK-NEXT: [[I12:%.*]] = select i1 [[I10]], float 0.000000e+00, float [[I6]] ; CHECK-NEXT: ret float [[I12]] ; bb: @@ -923,16 +877,10 @@ define nofpclass(nan inf) float @pow_f32(float nofpclass(nan inf) %arg, float no ; CHECK-NEXT: [[I11:%.*]] = and i1 [[I7]], [[I10]] ; CHECK-NEXT: [[I12:%.*]] = select i1 [[I11]], float [[ARG]], float 1.000000e+00 ; CHECK-NEXT: [[I13:%.*]] = tail call noundef float @llvm.copysign.f32(float noundef [[I4]], float noundef [[I12]]) -; CHECK-NEXT: [[I14:%.*]] = fcmp olt float [[ARG]], 0.000000e+00 -; CHECK-NEXT: [[I15:%.*]] = select i1 [[I7]], float [[I13]], float 0x7FF8000000000000 -; CHECK-NEXT: [[I16:%.*]] = select i1 [[I14]], float [[I15]], float [[I13]] ; CHECK-NEXT: [[I17:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00 -; CHECK-NEXT: [[I18:%.*]] = fcmp olt float [[ARG1]], 0.000000e+00 -; CHECK-NEXT: [[I19:%.*]] = xor i1 [[I17]], [[I18]] -; CHECK-NEXT: [[I20:%.*]] = select i1 [[I19]], float 0.000000e+00, float 0x7FF0000000000000 ; CHECK-NEXT: [[I21:%.*]] = select i1 [[I11]], float [[ARG]], float 0.000000e+00 -; CHECK-NEXT: [[I22:%.*]] = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef [[I20]], float noundef [[I21]]) -; CHECK-NEXT: [[I23:%.*]] = select i1 [[I17]], float [[I22]], float [[I16]] +; CHECK-NEXT: [[I22:%.*]] = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef 0.000000e+00, float noundef [[I21]]) +; CHECK-NEXT: [[I23:%.*]] = select i1 [[I17]], float [[I22]], float [[I13]] ; CHECK-NEXT: [[I24:%.*]] = fcmp oeq float [[ARG]], 1.000000e+00 ; CHECK-NEXT: [[I25:%.*]] = fcmp oeq float [[ARG1]], 0.000000e+00 ; CHECK-NEXT: [[I26:%.*]] = or i1 [[I24]], [[I25]] @@ -977,8 +925,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf(i ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern() -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[MUST_BE_INF]], float [[Y]] -; CHECK-NEXT: ret float [[SELECT]] +; CHECK-NEXT: ret float [[Y]] ; %must.be.inf = call 
nofpclass(nan norm zero sub) float @extern() %select = select i1 %cond, float %must.be.inf, float %y @@ -989,7 +936,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf(i1 %co ; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_INF]] +; CHECK-NEXT: ret float 0xFFF0000000000000 ; %must.be.inf = call nofpclass(nan norm zero sub) float @extern() ret float %must.be.inf @@ -999,7 +946,7 @@ define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf(i1 %co ; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_INF]] +; CHECK-NEXT: ret float 0x7FF0000000000000 ; %must.be.inf = call nofpclass(nan norm zero sub) float @extern() ret float %must.be.inf @@ -1009,7 +956,7 @@ define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero(i1 ; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_ZERO]] +; CHECK-NEXT: ret float 0.000000e+00 ; %must.be.zero = call nofpclass(nan sub norm inf) float @extern() ret float %must.be.zero @@ -1019,7 +966,7 @@ define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero(i1 ; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero ; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern() -; CHECK-NEXT: ret float [[MUST_BE_ZERO]] +; CHECK-NEXT: ret float -0.000000e+00 ; %must.be.zero = call nofpclass(nan sub norm inf) float @extern() ret float %must.be.zero @@ -1133,8 +1080,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_0(i1 %cond0, float ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @loop.cond() ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]] ; CHECK: ret: -; CHECK-NEXT: [[PHI_RET:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0x7FF0000000000000, [[LOOP]] ] -; CHECK-NEXT: ret float [[PHI_RET]] +; CHECK-NEXT: ret float 0.000000e+00 ; entry: br i1 %cond0, label %loop, label %ret @@ -1159,7 +1105,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_1(i1 %cond0, float ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @loop.cond() ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]] ; CHECK: ret: -; CHECK-NEXT: ret float 0x7FF0000000000000 +; CHECK-NEXT: ret float poison ; entry: br i1 %cond0, label %loop, label %ret @@ -1180,8 +1126,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__phi_switch_repeated_predecessor( ; CHECK-SAME: (i32 [[SWITCH:%.*]], float [[UNKNOWN:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i32 [[SWITCH]], label [[RET:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP:%.*]] -; CHECK-NEXT: i32 1, label [[LOOP]] +; CHECK-NEXT: i32 0, label [[LOOP:%.*]] +; CHECK-NEXT: i32 1, label [[LOOP]] ; CHECK-NEXT: ] ; CHECK: loop: ; CHECK-NEXT: [[PHI_LOOP:%.*]] = phi float [ 0x7FF0000000000000, [[ENTRY:%.*]] ], [ 0x7FF0000000000000, [[ENTRY]] ], [ [[UNKNOWN]], [[LOOP]] ] @@ -1211,8 +1157,7 @@ ret: define 
nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs(i1 %cond, float %x) { ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) { -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[FENCE:%.*]] = call float @llvm.arithmetic.fence.f32(float [[SELECT]]) +; CHECK-NEXT: [[FENCE:%.*]] = call float @llvm.arithmetic.fence.f32(float [[X]]) ; CHECK-NEXT: ret float [[FENCE]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 -- cgit v1.1 From 35d6ae8110e082e9a4704416dfbe83d5a3b16ed1 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 09:44:51 +0100 Subject: [InstCombine] Handle multi-use in simplifyAndOrWithOpReplaced() (#81006) Slightly generalize simplifyAndOrWithOpReplaced() by allowing it to perform simplifications (without creating new instructions) in multi-use cases. This way we can remove existing patterns without worrying about multi-use edge cases. I've opted to change the general way the implementation works to be more similar to the standard simplifyWithOpReplaced(). We perform the operand replacement generically, and then try to simplify the result or create a new instruction if we're allowed to do so. --- .../Transforms/InstCombine/InstCombineAndOrXor.cpp | 92 +++++++++++----------- llvm/test/Transforms/InstCombine/or.ll | 3 +- 2 files changed, 47 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index aa3b9da..a53eb39 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2217,47 +2217,47 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, } } -// Try to simplify X | Y by replacing occurrences of Y in X with 0. -// Similarly, simplify X & Y by replacing occurrences of Y in X with -1. +// Try to simplify V by replacing occurrences of Op with RepOp, but only look +// through bitwise operations. In particular, for X | Y we try to replace Y with +// 0 inside X and for X & Y we try to replace Y with -1 inside X. // Return the simplified result of X if successful, and nullptr otherwise. -static Value *simplifyAndOrWithOpReplaced(Value *X, Value *Y, bool IsAnd, +// If SimplifyOnly is true, no new instructions will be created. +static Value *simplifyAndOrWithOpReplaced(Value *V, Value *Op, Value *RepOp, + bool SimplifyOnly, InstCombinerImpl &IC, unsigned Depth = 0) { - if (isa(X) || X == Y) + if (Op == RepOp) return nullptr; - Value *RHS; - if (match(X, m_c_And(m_Specific(Y), m_Value(RHS)))) { - return IsAnd ? RHS : Constant::getNullValue(X->getType()); - } else if (match(X, m_c_Or(m_Specific(Y), m_Value(RHS)))) { - return IsAnd ? Constant::getAllOnesValue(X->getType()) : RHS; - } else if (match(X, m_c_Xor(m_Specific(Y), m_Value(RHS)))) { - if (IsAnd) { - if (X->hasOneUse()) - return IC.Builder.CreateNot(RHS); + if (V == Op) + return RepOp; - if (Value *NotRHS = - IC.getFreelyInverted(RHS, RHS->hasOneUse(), &IC.Builder)) - return NotRHS; - } else - return RHS; - } + auto *I = dyn_cast(V); + if (!I || !I->isBitwiseLogicOp() || Depth >= 3) + return nullptr; - // Replace uses of Y in X recursively. - Value *Op0, *Op1; - if (Depth < 2 && match(X, m_BitwiseLogic(m_Value(Op0), m_Value(Op1)))) { - // TODO: Relax the one-use constraint to clean up existing hard-coded - // simplifications. 
- if (!X->hasOneUse()) - return nullptr; - Value *NewOp0 = simplifyAndOrWithOpReplaced(Op0, Y, IsAnd, IC, Depth + 1); - Value *NewOp1 = simplifyAndOrWithOpReplaced(Op1, Y, IsAnd, IC, Depth + 1); - if (!NewOp0 && !NewOp1) - return nullptr; - return IC.Builder.CreateBinOp(cast(X)->getOpcode(), - NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1); - } - return nullptr; + if (!I->hasOneUse()) + SimplifyOnly = true; + + Value *NewOp0 = simplifyAndOrWithOpReplaced(I->getOperand(0), Op, RepOp, + SimplifyOnly, IC, Depth + 1); + Value *NewOp1 = simplifyAndOrWithOpReplaced(I->getOperand(1), Op, RepOp, + SimplifyOnly, IC, Depth + 1); + if (!NewOp0 && !NewOp1) + return nullptr; + + if (!NewOp0) + NewOp0 = I->getOperand(0); + if (!NewOp1) + NewOp1 = I->getOperand(1); + + if (Value *Res = simplifyBinOp(I->getOpcode(), NewOp0, NewOp1, + IC.getSimplifyQuery().getWithInstruction(I))) + return Res; + + if (SimplifyOnly) + return nullptr; + return IC.Builder.CreateBinOp(I->getOpcode(), NewOp0, NewOp1); } // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches @@ -2781,9 +2781,13 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Instruction *Res = foldBitwiseLogicWithIntrinsics(I, Builder)) return Res; - if (Value *V = simplifyAndOrWithOpReplaced(Op0, Op1, /*IsAnd*/ true, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op0, Op1, Constant::getAllOnesValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateAnd(V, Op1); - if (Value *V = simplifyAndOrWithOpReplaced(Op1, Op0, /*IsAnd*/ true, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op1, Op0, Constant::getAllOnesValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateAnd(Op0, V); return nullptr; @@ -3602,14 +3606,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) return BinaryOperator::CreateOr(Op1, C); - // ((A & B) ^ C) | B -> C | B - if (match(Op0, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op1)), m_Value(C)))) - return BinaryOperator::CreateOr(C, Op1); - - // B | ((A & B) ^ C) -> B | C - if (match(Op1, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op0)), m_Value(C)))) - return BinaryOperator::CreateOr(Op0, C); - if (Instruction *DeMorgan = matchDeMorgansLaws(I, *this)) return DeMorgan; @@ -3965,9 +3961,13 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *Res = foldBitwiseLogicWithIntrinsics(I, Builder)) return Res; - if (Value *V = simplifyAndOrWithOpReplaced(Op0, Op1, /*IsAnd*/ false, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op0, Op1, Constant::getNullValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateOr(V, Op1); - if (Value *V = simplifyAndOrWithOpReplaced(Op1, Op0, /*IsAnd*/ false, *this)) + if (Value *V = + simplifyAndOrWithOpReplaced(Op1, Op0, Constant::getNullValue(Ty), + /*SimplifyOnly*/ false, *this)) return BinaryOperator::CreateOr(Op0, V); return nullptr; diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 51863af..1b1a6ff 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -1938,8 +1938,7 @@ define i32 @test_or_and_and_multiuse(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[AND1]], [[C:%.*]] ; CHECK-NEXT: call void @use(i32 [[AND1]]) ; CHECK-NEXT: call void @use(i32 [[AND2]]) -; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND2]], [[A]] -; CHECK-NEXT: ret i32 [[OR]] +; CHECK-NEXT: ret i32 [[A]] ; %and1 = and i32 %a, %b %and2 = and i32 %and1, 
%c
-- cgit v1.1
From 7c0d52ca91d32e693ca245fb82f2402a34212fc3 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 09:47:49 +0100 Subject: [ValueTracking] Support dominating known bits condition in and/or (#74728)
This extends computeKnownBits() support for dominating conditions to also handle and/or conditions. We'll look through either `and` or `or`, depending on which edge we're considering. This change is mainly for the sake of completeness, so we don't start missing optimizations if SimplifyCFG decides to merge some branches.
--- llvm/lib/Analysis/DomConditionCache.cpp | 48 ++++++++++++++++--------- llvm/lib/Analysis/ValueTracking.cpp | 32 +++++++++++------ llvm/test/Transforms/InstCombine/known-bits.ll | 15 +++----- llvm/test/Transforms/LoopVectorize/induction.ll | 30 ++++++++-------- 4 files changed, 74 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Analysis/DomConditionCache.cpp b/llvm/lib/Analysis/DomConditionCache.cpp index c7f4cab..3dad0c2 100644 --- a/llvm/lib/Analysis/DomConditionCache.cpp +++ b/llvm/lib/Analysis/DomConditionCache.cpp @@ -34,23 +34,39 @@ static void findAffectedValues(Value *Cond, } }; - ICmpInst::Predicate Pred; - Value *A; - if (match(Cond, m_ICmp(Pred, m_Value(A), m_Constant()))) { - AddAffected(A); + bool TopLevelIsAnd = match(Cond, m_LogicalAnd()); + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(Cond); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; - if (ICmpInst::isEquality(Pred)) { - Value *X; - // (X & C) or (X | C) or (X ^ C). - // (X << C) or (X >>_s C) or (X >>_u C). - if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) || - match(A, m_Shift(m_Value(X), m_ConstantInt()))) - AddAffected(X); - } else { - Value *X; - // Handle (A + C1) u< C2, which is the canonical form of A > C3 && A < C4. - if (match(A, m_Add(m_Value(X), m_ConstantInt()))) - AddAffected(X); + ICmpInst::Predicate Pred; + Value *A, *B; + // Only recurse into and/or if it matches the top-level and/or type. + if (TopLevelIsAnd ? match(V, m_LogicalAnd(m_Value(A), m_Value(B))) + : match(V, m_LogicalOr(m_Value(A), m_Value(B)))) { + Worklist.push_back(A); + Worklist.push_back(B); + } else if (match(V, m_ICmp(Pred, m_Value(A), m_Constant()))) { + AddAffected(A); + + if (ICmpInst::isEquality(Pred)) { + Value *X; + // (X & C) or (X | C) or (X ^ C). + // (X << C) or (X >>_s C) or (X >>_u C). + if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) || + match(A, m_Shift(m_Value(X), m_ConstantInt()))) + AddAffected(X); + } else { + Value *X; + // Handle (A + C1) u< C2, which is the canonical form of + // A > C3 && A < C4. + if (match(A, m_Add(m_Value(X), m_ConstantInt()))) + AddAffected(X); + } } } } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 58db81f..0e40a02 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -706,28 +706,40 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred, } } +static void computeKnownBitsFromCond(const Value *V, Value *Cond, + KnownBits &Known, unsigned Depth, + const SimplifyQuery &SQ, bool Invert) { + Value *A, *B; + if (Depth < MaxAnalysisRecursionDepth && + (Invert ?
match(Cond, m_LogicalOr(m_Value(A), m_Value(B))) + : match(Cond, m_LogicalAnd(m_Value(A), m_Value(B))))) { + computeKnownBitsFromCond(V, A, Known, Depth + 1, SQ, Invert); + computeKnownBitsFromCond(V, B, Known, Depth + 1, SQ, Invert); + } + + if (auto *Cmp = dyn_cast(Cond)) + computeKnownBitsFromCmp( + V, Invert ? Cmp->getInversePredicate() : Cmp->getPredicate(), + Cmp->getOperand(0), Cmp->getOperand(1), Known, SQ); +} + void llvm::computeKnownBitsFromContext(const Value *V, KnownBits &Known, - unsigned Depth, const SimplifyQuery &Q) { + unsigned Depth, const SimplifyQuery &Q) { if (!Q.CxtI) return; if (Q.DC && Q.DT) { // Handle dominating conditions. for (BranchInst *BI : Q.DC->conditionsFor(V)) { - auto *Cmp = dyn_cast(BI->getCondition()); - if (!Cmp) - continue; - BasicBlockEdge Edge0(BI->getParent(), BI->getSuccessor(0)); if (Q.DT->dominates(Edge0, Q.CxtI->getParent())) - computeKnownBitsFromCmp(V, Cmp->getPredicate(), Cmp->getOperand(0), - Cmp->getOperand(1), Known, Q); + computeKnownBitsFromCond(V, BI->getCondition(), Known, Depth, Q, + /*Invert*/ false); BasicBlockEdge Edge1(BI->getParent(), BI->getSuccessor(1)); if (Q.DT->dominates(Edge1, Q.CxtI->getParent())) - computeKnownBitsFromCmp(V, Cmp->getInversePredicate(), - Cmp->getOperand(0), Cmp->getOperand(1), Known, - Q); + computeKnownBitsFromCond(V, BI->getCondition(), Known, Depth, Q, + /*Invert*/ true); } if (Known.hasConflict()) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index e346330..246579c 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -105,8 +105,7 @@ define i8 @test_cond_and(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP]], [[C:%.*]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] @@ -133,8 +132,7 @@ define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) { ; CHECK-NEXT: [[COND:%.*]] = and i1 [[C3]], [[CMP]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] @@ -161,8 +159,7 @@ define i8 @test_cond_logical_and(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i1 [[C:%.*]], i1 false ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] @@ -218,8 +215,7 @@ define i8 @test_cond_inv_or(i8 %x, i1 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR1]] ; CHECK: exit: -; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR2]] +; CHECK-NEXT: ret i8 -4 ; %and = and i8 %x, 3 %cmp = icmp ne i8 %and, 0 @@ -242,8 +238,7 @@ define i8 @test_cond_inv_logical_or(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP_NOT]], i1 [[C:%.*]], i1 false ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret 
i8 [[OR2]] diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 29d8719d..50a5cc6 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -3523,10 +3523,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: -; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 +; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] +; IND-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -3589,10 +3589,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: -; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 +; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] +; UNROLL-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -3733,10 +3733,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] +; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], @@ -3907,11 +3907,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: -; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 +; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]] -; IND-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2 +; IND-NEXT: [[EXT_MUL5:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] +; IND-NEXT: [[IND_END1:%.*]] = shl nuw 
nsw i32 [[EXT_MUL5]], 2 ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -3976,11 +3976,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: -; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 +; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; UNROLL-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2 +; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] +; UNROLL-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2 ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], @@ -4126,11 +4126,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2 +; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] +; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2 ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], -- cgit v1.1 From 7ec6e7351458924946e9afaadf9788cb233095b9 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Thu, 8 Feb 2024 08:58:13 +0000 Subject: [DAG] Fix typos in comments; NFC --- llvm/include/llvm/CodeGen/SelectionDAG.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index b9ec307..886ec0b 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1613,10 +1613,10 @@ public: /// Expand the specified \c ISD::VACOPY node as the Legalize pass would. SDValue expandVACopy(SDNode *Node); - /// Returs an GlobalAddress of the function from the current module with + /// Return a GlobalAddress of the function from the current module with /// name matching the given ExternalSymbol. Additionally can provide the /// matched function. - /// Panics the function doesn't exists. + /// Panic if the function doesn't exist. 
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction = nullptr); @@ -2255,7 +2255,7 @@ public: std::pair GetDependentSplitDestVTs(const EVT &VT, const EVT &EnvVT, bool *HiIsEmpty) const; - /// Split the vector with EXTRACT_SUBVECTOR using the provides + /// Split the vector with EXTRACT_SUBVECTOR using the provided /// VTs and return the low/high part. std::pair SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT); -- cgit v1.1 From dd9511d3e46094ec15282bce6eba163fed2226a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 7 Feb 2024 16:06:59 +0100 Subject: [clang][Interp][NFC] Convert test case to verify=expected,both style --- clang/test/AST/Interp/builtin-functions.cpp | 103 +++++++++++----------------- 1 file changed, 39 insertions(+), 64 deletions(-) diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index d6ed2d8..3aa01d5 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify -// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify -// RUN: %clang_cc1 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated -// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify -// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify -// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated -// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify -// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated +// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both +// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify=expected,both +// RUN: %clang_cc1 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated +// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both +// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify=expected,both +// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated +// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both +// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated namespace strcmp { @@ -23,23 +23,17 @@ namespace strcmp { static_assert(__builtin_strcmp("abab\0banana", "abab") == 0, ""); static_assert(__builtin_strcmp("abab", "abab\0banana") == 0, ""); static_assert(__builtin_strcmp("abab\0banana", "abab\0canada") == 0, ""); - static_assert(__builtin_strcmp(0, "abab") == 0, ""); // expected-error {{not an integral constant}} \ - // expected-note {{dereferenced null}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant}} \ - // ref-note {{dereferenced null}} - static_assert(__builtin_strcmp("abab", 0) == 0, ""); // expected-error {{not an integral constant}} \ - // expected-note {{dereferenced null}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant}} \ - // 
ref-note {{dereferenced null}} + static_assert(__builtin_strcmp(0, "abab") == 0, ""); // both-error {{not an integral constant}} \ + // both-note {{dereferenced null}} \ + // expected-note {{in call to}} + static_assert(__builtin_strcmp("abab", 0) == 0, ""); // both-error {{not an integral constant}} \ + // both-note {{dereferenced null}} \ + // expected-note {{in call to}} static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar) == -1, ""); - static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // expected-error {{not an integral constant}} \ - // expected-note {{dereferenced one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant}} \ - // ref-note {{dereferenced one-past-the-end}} + static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // both-error {{not an integral constant}} \ + // both-note {{dereferenced one-past-the-end}} \ + // expected-note {{in call to}} } /// Copied from constant-expression-cxx11.cpp @@ -69,41 +63,27 @@ constexpr const char *a = "foo\0quux"; static_assert(check(b), ""); static_assert(check(c), ""); - constexpr int over1 = __builtin_strlen(a + 9); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} - constexpr int over2 = __builtin_strlen(b + 9); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} - constexpr int over3 = __builtin_strlen(c + 9); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} - - constexpr int under1 = __builtin_strlen(a - 1); // expected-error {{constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{constant expression}} \ - // ref-note {{cannot refer to element -1}} - constexpr int under2 = __builtin_strlen(b - 1); // expected-error {{constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{constant expression}} \ - // ref-note {{cannot refer to element -1}} - constexpr int under3 = __builtin_strlen(c - 1); // expected-error {{constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{constant expression}} \ - // ref-note {{cannot refer to element -1}} + constexpr int over1 = __builtin_strlen(a + 9); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} + constexpr int over2 = __builtin_strlen(b + 9); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} + constexpr int over3 = __builtin_strlen(c + 9); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} + + constexpr int under1 = __builtin_strlen(a - 1); // both-error {{constant expression}} \ + // both-note {{cannot refer to element -1}} + constexpr int under2 = __builtin_strlen(b - 1); // both-error {{constant expression}} \ + // both-note {{cannot refer to element -1}} + constexpr int under3 = __builtin_strlen(c - 1); // both-error {{constant expression}} \ + // both-note {{cannot refer to element -1}} constexpr char d[] = { 'f', 'o', 'o' }; // no nul terminator. 
- constexpr int bad = __builtin_strlen(d); // expected-error {{constant expression}} \ - // expected-note {{one-past-the-end}} \ - // expected-note {{in call to}} \ - // ref-error {{constant expression}} \ - // ref-note {{one-past-the-end}} + constexpr int bad = __builtin_strlen(d); // both-error {{constant expression}} \ + // both-note {{one-past-the-end}} \ + // expected-note {{in call to}} } namespace nan { @@ -115,8 +95,7 @@ namespace nan { // expected-error@-2 {{must be initialized by a constant expression}} #endif - constexpr double NaN3 = __builtin_nan("foo"); // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} + constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}} constexpr float NaN4 = __builtin_nanf(""); //constexpr long double NaN5 = __builtin_nanf128(""); @@ -126,8 +105,7 @@ namespace nan { /// FIXME: Current interpreter misses diagnostics. constexpr char f2[] = {'0', 'x', 'A', 'E'}; /// No trailing 0 byte. - constexpr double NaN7 = __builtin_nan(f2); // ref-error {{must be initialized by a constant expression}} \ - // expected-error {{must be initialized by a constant expression}} \ + constexpr double NaN7 = __builtin_nan(f2); // both-error {{must be initialized by a constant expression}} \ // expected-note {{read of dereferenced one-past-the-end pointer}} \ // expected-note {{in call to}} static_assert(!__builtin_issignaling(__builtin_nan("")), ""); @@ -370,9 +348,6 @@ namespace EhReturnDataRegno { case __builtin_eh_return_data_regno(0): // constant foldable. break; } - - __builtin_eh_return_data_regno(X); // expected-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}} \ - // ref-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}} - + __builtin_eh_return_data_regno(X); // both-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}} } }
-- cgit v1.1
From ef05b4b520ee342db6a3d6c5607f8e8729246316 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 8 Feb 2024 09:31:26 +0000 Subject: [BasicAA] More vscale tests. NFC
This time with i8 geps and vscale intrinsics, along with multiple vscale intrinsics that can be treated as identical.
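For reference, the pattern being exercised looks roughly like the sketch below (illustrative only, with made-up names; not one of the added tests). Both vscale calls return the same value within a function, so offsets built from either call can be treated as equal:

define void @sketch(ptr %p) {
  %v1 = call i64 @llvm.vscale.i64()
  %v2 = call i64 @llvm.vscale.i64()
  ; The two offsets below are equal because both calls yield the same
  ; vscale, letting BasicAA reason about the resulting i8 geps together.
  %off1 = mul nsw i64 %v1, 16
  %off2 = mul nsw i64 %v2, 16
  %gep1 = getelementptr i8, ptr %p, i64 %off1
  %gep2 = getelementptr i8, ptr %p, i64 %off2
  %l1 = load i8, ptr %gep1
  %l2 = load i8, ptr %gep2
  ret void
}
declare i64 @llvm.vscale.i64()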
--- llvm/test/Analysis/BasicAA/vscale.ll | 168 +++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index 3fff435..1b9118b 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -309,6 +309,174 @@ define void @v1v2types(ptr %p) vscale_range(1,16) { ret void } +; VScale intrinsic offset tests + +; CHECK-LABEL: vscale_neg_notscalable +; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %vm16, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %vm16m16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %p +; CHECK-DAG: NoAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %m16pv16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16m16 +define void @vscale_neg_notscalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vm + %m16 = getelementptr <4 x i32>, ptr %p, i64 -1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 -1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vp + load <4 x i32>, ptr %p + load <4 x i32>, ptr %vm16 + load <4 x i32>, ptr %m16 + load <4 x i32>, ptr %vm16m16 + load <4 x i32>, ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_neg_scalable +; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %p +; CHECK-DAG: MayAlias: * %m16, * %vm16 +; CHECK-DAG: MayAlias: * %p, * %vm16m16 +; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16pv16, * %p +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 +define void @vscale_neg_scalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vm + %m16 = getelementptr <4 x i32>, ptr %p, i64 -1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 -1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vp + load , ptr %p + load , ptr %vm16 + load , ptr %m16 + load , ptr %vm16m16 + load , ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_pos_notscalable +; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %vm16, <4 x i32>* %vm16m16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %vm16m16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %p +; CHECK-DAG: NoAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %m16pv16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16m16 +define void @vscale_pos_notscalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vp + %m16 = getelementptr <4 x i32>, ptr %p, i64 1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vm + load <4 x i32>, ptr %p + load <4 x i32>, ptr %vm16 + load <4 x i32>, ptr %m16 + load <4 x i32>, ptr %vm16m16 + load <4 x 
i32>, ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_pos_scalable +; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %p +; CHECK-DAG: MayAlias: * %m16, * %vm16 +; CHECK-DAG: MayAlias: * %p, * %vm16m16 +; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: MayAlias: * %m16pv16, * %p +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 +define void @vscale_pos_scalable(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vm16 = getelementptr i8, ptr %p, i64 %vp + %m16 = getelementptr <4 x i32>, ptr %p, i64 1 + %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 1 + %m16pv16 = getelementptr i8, ptr %m16, i64 %vm + load , ptr %p + load , ptr %vm16 + load , ptr %m16 + load , ptr %vm16m16 + load , ptr %m16pv16 + ret void +} + +; CHECK-LABEL: vscale_v1v2types +; CHECK-DAG: MustAlias: <4 x i32>* %p, * %p +; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vm16 +; CHECK-DAG: MayAlias: * %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 +; CHECK-DAG: MustAlias: <4 x i32>* %vm16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, * %p +; CHECK-DAG: MayAlias: * %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: * %m16, * %vm16 +; CHECK-DAG: MayAlias: * %m16, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* %m16, * %p +; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p +; CHECK-DAG: MayAlias: <4 x i32>* %m16, * %vm16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 +; CHECK-DAG: MustAlias: <4 x i32>* %m16, * %m16 +; CHECK-DAG: MayAlias: * %p, * %vp16 +; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vp16 +; CHECK-DAG: MayAlias: * %vm16, * %vp16 +; CHECK-DAG: MayAlias: <4 x i32>* %vm16, * %vp16 +; CHECK-DAG: MayAlias: * %m16, * %vp16 +; CHECK-DAG: MayAlias: <4 x i32>* %m16, * %vp16 +define void @vscale_v1v2types(ptr %p) { + %v = call i64 @llvm.vscale.i64() + %vp = mul nsw i64 %v, 16 + %vm = mul nsw i64 %v, -16 + %vp16 = getelementptr i8, ptr %p, i64 %vp + %vm16 = getelementptr i8, ptr %p, i64 %vm + %m16 = getelementptr <4 x i32>, ptr %p, i64 -1 + load , ptr %p + load <4 x i32>, ptr %p + load , ptr %vm16 + load <4 x i32>, ptr %vm16 + load , ptr %m16 + load <4 x i32>, ptr %m16 + load , ptr %vp16 + ret void +} + +; CHECK-LABEL: twovscales +; CHECK-DAG: MayAlias: * %vp161, * %vp162 +; CHECK-DAG: MayAlias: * %vp161, * %vp161b +; CHECK-DAG: MayAlias: * %vp161b, * %vp162 +define void @twovscales(ptr %p) { + %v1 = call i64 @llvm.vscale.i64() + %v2 = call i64 @llvm.vscale.i64() + %vp1 = mul nsw i64 %v1, 16 + %vp2 = mul nsw i64 %v2, 16 + %vp3 = mul nsw i64 %v1, 17 + %vp161 = getelementptr i8, ptr %p, i64 %vp1 + %vp162 = getelementptr i8, ptr %p, i64 %vp2 + %vp161b = getelementptr i8, ptr %vp161, i64 %vp3 + load , ptr %vp161 + load , ptr %vp162 + load , ptr %vp161b + ret void +} + ; getelementptr recursion ; CHECK-LABEL: gep_recursion_level_1 -- cgit v1.1 From 9ac82f0d3ecf6c13669b0c7940920460c037a292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 8 Feb 2024 11:45:57 +0200 Subject: [OpenMP] [cmake] In standalone mode, make Python3_EXECUTABLE available (#80828) When running the tests, we try to invoke them as "${Python3_EXECUTABLE} ${OPENMP_LLVM_LIT_EXECUTABLE}", but when running "find_package(Python3)" within the function "find_standalone_test_dependencies", the variable "Python3_EXECUTABLE" only gets set within the function scope. 
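(For illustration, a minimal sketch of that scoping behavior; the function name here is hypothetical, while the real code lives in find_standalone_test_dependencies:

function(locate_python_sketch)
  find_package(Python3 COMPONENTS Interpreter)
  # find_package() sets Python3_EXECUTABLE here, but only inside this
  # function's scope; it must be re-set with PARENT_SCOPE to become
  # visible to the caller.
  set(Python3_EXECUTABLE ${Python3_EXECUTABLE} PARENT_SCOPE)
endfunction()
)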
Tests have worked regardless of this in many cases, where executing the Python script directly succeeds. But for consistency, and to handle cases where the Python script can't be executed directly, make the Python3_EXECUTABLE variable available as intended.
--- openmp/cmake/OpenMPTesting.cmake | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/openmp/cmake/OpenMPTesting.cmake b/openmp/cmake/OpenMPTesting.cmake index df41956..ab2348ae 100644 --- a/openmp/cmake/OpenMPTesting.cmake +++ b/openmp/cmake/OpenMPTesting.cmake @@ -10,6 +10,8 @@ function(find_standalone_test_dependencies) message(WARNING "The check targets will not be available!") set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE) return() + else() + set(Python3_EXECUTABLE ${Python3_EXECUTABLE} PARENT_SCOPE) endif() # Find executables.
-- cgit v1.1
From 49ee2ffc65b7660bfe84cd842e083d6c0ee3e991 Mon Sep 17 00:00:00 2001 From: Evgeniy Date: Thu, 8 Feb 2024 02:06:22 -0800 Subject: [X86][GlobalISel] Reorganize br/brcond tests (NFC) (#80204)
Remove duplicated tests under GlobalISel, consolidating them to perform the checks with all three selectors.
--- llvm/test/CodeGen/X86/GlobalISel/br.ll | 19 - llvm/test/CodeGen/X86/GlobalISel/brcond.ll | 91 -- llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll | 293 ------ llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll | 469 --------- llvm/test/CodeGen/X86/isel-br.ll | 31 + llvm/test/CodeGen/X86/isel-brcond-fcmp.ll | 1341 ++++++++++++++++++++++++ llvm/test/CodeGen/X86/isel-brcond-icmp.ll | 1107 +++++++++++++++++++ 7 files changed, 2479 insertions(+), 872 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/GlobalISel/br.ll delete mode 100644 llvm/test/CodeGen/X86/GlobalISel/brcond.ll delete mode 100644 llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll delete mode 100644 llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll create mode 100644 llvm/test/CodeGen/X86/isel-br.ll create mode 100644 llvm/test/CodeGen/X86/isel-brcond-fcmp.ll create mode 100644 llvm/test/CodeGen/X86/isel-brcond-icmp.ll
diff --git a/llvm/test/CodeGen/X86/GlobalISel/br.ll b/llvm/test/CodeGen/X86/GlobalISel/br.ll deleted file mode 100644 index 878fe98..0000000 --- a/llvm/test/CodeGen/X86/GlobalISel/br.ll +++ /dev/null @@ -1,19 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s - -define void @uncondbr() { -; CHECK-LABEL: uncondbr: -; CHECK: # %bb.1: # %entry -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_2: # %end -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB0_3: # %bb2 -; CHECK-NEXT: jmp .LBB0_2 -entry: - br label %bb2 -end: - ret void -bb2: - br label %end -} - diff --git a/llvm/test/CodeGen/X86/GlobalISel/brcond.ll b/llvm/test/CodeGen/X86/GlobalISel/brcond.ll deleted file mode 100644 index b38fbfd..0000000 --- a/llvm/test/CodeGen/X86/GlobalISel/brcond.ll +++ /dev/null @@ -1,91 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X64 -; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X86 - -define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) { -; X64-LABEL: test_1: -; X64: # %bb.0: # %entry -; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setl %al -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB0_2 -; X64-NEXT: # %bb.1: # %if.then -; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax -;
X64-NEXT: retq -; X64-NEXT: .LBB0_2: # %if.else -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: retq -; -; X86-LABEL: test_1: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %eax -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: setl %al -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB0_2 -; X86-NEXT: # %bb.1: # %if.then -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: jmp .LBB0_3 -; X86-NEXT: .LBB0_2: # %if.else -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: .LBB0_3: # %return -; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: popl %ecx -; X86-NEXT: .cfi_def_cfa_offset 4 -; X86-NEXT: retl -entry: - %retval = alloca i32, align 4 - %cmp = icmp slt i32 %a, %b - br i1 %cmp, label %if.then, label %if.else - -if.then: - store i32 %tValue, ptr %retval, align 4 - br label %return - -if.else: - store i32 %fValue, ptr %retval, align 4 - br label %return - -return: - %0 = load i32, ptr %retval, align 4 - ret i32 %0 -} - -define i32 @test_2(i32 %a) { -; X64-LABEL: test_2: -; X64: # %bb.0: # %entry -; X64-NEXT: testb $1, %dil -; X64-NEXT: je .LBB1_2 -; X64-NEXT: # %bb.1: # %if.then -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB1_2: # %if.else -; X64-NEXT: movl $1, %eax -; X64-NEXT: retq -; -; X86-LABEL: test_2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB1_2 -; X86-NEXT: # %bb.1: # %if.then -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB1_2: # %if.else -; X86-NEXT: movl $1, %eax -; X86-NEXT: retl -entry: - %cmp = trunc i32 %a to i1 - br i1 %cmp, label %if.then, label %if.else - -if.then: - ret i32 0 -if.else: - ret i32 1 -} - diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll deleted file mode 100644 index 475d8fc..0000000 --- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll +++ /dev/null @@ -1,293 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s - -define i32 @fcmp_oeq(float %x, float %y) { -; CHECK-LABEL: fcmp_oeq -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oeq float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ogt(float %x, float %y) { -; CHECK-LABEL: fcmp_ogt -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp ogt float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oge(float %x, float %y) { -; CHECK-LABEL: fcmp_oge -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp oge float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_olt(float %x, float %y) { -; CHECK-LABEL: fcmp_olt -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp olt float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ole(float %x, float %y) { -; CHECK-LABEL: fcmp_ole -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp ole float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_one(float %x, float %y) { -; CHECK-LABEL: fcmp_one -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_1}} - %1 = 
fcmp one float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ord(float %x, float %y) { -; CHECK-LABEL: fcmp_ord -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ord float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uno(float %x, float %y) { -; CHECK-LABEL: fcmp_uno -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jp {{LBB.+_2}} - %1 = fcmp uno float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ueq(float %x, float %y) { -; CHECK-LABEL: fcmp_ueq -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_2}} - %1 = fcmp ueq float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ugt(float %x, float %y) { -; CHECK-LABEL: fcmp_ugt -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ugt float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uge(float %x, float %y) { -; CHECK-LABEL: fcmp_uge -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp uge float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ult(float %x, float %y) { -; CHECK-LABEL: fcmp_ult -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ult float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ule(float %x, float %y) { -; CHECK-LABEL: fcmp_ule -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp ule float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_une(float %x, float %y) { -; CHECK-LABEL: fcmp_une -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp une float %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_eq(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_eq -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jne {{LBB.+_1}} - %1 = icmp eq i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ne(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ne -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: je {{LBB.+_1}} - %1 = icmp ne i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ugt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ugt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = icmp ugt i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_uge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_uge -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = icmp uge i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ult(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ult -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = icmp ult i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ule(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ule -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = icmp ule i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sgt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sgt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jle {{LBB.+_1}} - %1 = icmp sgt i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 
-bb1: - ret i32 0 -} - -define i32 @icmp_sge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sge -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jl {{LBB.+_1}} - %1 = icmp sge i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_slt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_slt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jge {{LBB.+_1}} - %1 = icmp slt i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sle(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sle -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jg {{LBB.+_1}} - %1 = icmp sle i32 %x, %y - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll deleted file mode 100644 index 8f09b2e3..0000000 --- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ /dev/null @@ -1,469 +0,0 @@ -; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s - -define i32 @fcmp_oeq1(float %x) { -; CHECK-LABEL: fcmp_oeq1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oeq float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oeq2(float %x) { -; CHECK-LABEL: fcmp_oeq2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oeq float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ogt1(float %x) { -; CHECK-LABEL: fcmp_ogt1 -; CHECK-NOT: ucomiss -; CHECK: movl $1, %eax - %1 = fcmp ogt float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ogt2(float %x) { -; CHECK-LABEL: fcmp_ogt2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp ogt float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oge1(float %x) { -; CHECK-LABEL: fcmp_oge1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp oge float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_oge2(float %x) { -; CHECK-LABEL: fcmp_oge2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp oge float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_olt1(float %x) { -; CHECK-LABEL: fcmp_olt1 -; CHECK-NOT: ucomiss -; CHECK: movl $1, %eax - %1 = fcmp olt float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_olt2(float %x) { -; CHECK-LABEL: fcmp_olt2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jbe {{LBB.+_1}} - %1 = fcmp olt float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ole1(float %x) { -; CHECK-LABEL: fcmp_ole1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ole float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ole2(float %x) { -; CHECK-LABEL: fcmp_ole2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jb {{LBB.+_1}} - %1 = fcmp ole float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 
@fcmp_one1(float %x) { -; CHECK-LABEL: fcmp_one1 -; CHECK-NOT: ucomiss -; CHECK: movl $1, %eax - %1 = fcmp one float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_one2(float %x) { -; CHECK-LABEL: fcmp_one2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_1}} - %1 = fcmp one float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ord1(float %x) { -; CHECK-LABEL: fcmp_ord1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ord float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ord2(float %x) { -; CHECK-LABEL: fcmp_ord2 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} - %1 = fcmp ord float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uno1(float %x) { -; CHECK-LABEL: fcmp_uno1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_2}} - %1 = fcmp uno float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uno2(float %x) { -; CHECK-LABEL: fcmp_uno2 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp {{LBB.+_2}} - %1 = fcmp uno float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ueq1(float %x) { -; CHECK-LABEL: fcmp_ueq1 -; CHECK-NOT: ucomiss - %1 = fcmp ueq float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ueq2(float %x) { -; CHECK-LABEL: fcmp_ueq2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_2}} - %1 = fcmp ueq float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ugt1(float %x) { -; CHECK-LABEL: fcmp_ugt1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp ugt float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ugt2(float %x) { -; CHECK-LABEL: fcmp_ugt2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ugt float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uge1(float %x) { -; CHECK-LABEL: fcmp_uge1 -; CHECK-NOT: ucomiss - %1 = fcmp uge float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_uge2(float %x) { -; CHECK-LABEL: fcmp_uge2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp uge float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ult1(float %x) { -; CHECK-LABEL: fcmp_ult1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp ult float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ult2(float %x) { -; CHECK-LABEL: fcmp_ult2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jae {{LBB.+_1}} - %1 = fcmp ult float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ule1(float %x) { -; CHECK-LABEL: fcmp_ule1 -; CHECK-NOT: ucomiss - %1 = fcmp ule float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_ule2(float %x) { -; 
CHECK-LABEL: fcmp_ule2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: ja {{LBB.+_1}} - %1 = fcmp ule float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_une1(float %x) { -; CHECK-LABEL: fcmp_une1 -; CHECK: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp une float %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @fcmp_une2(float %x) { -; CHECK-LABEL: fcmp_une2 -; CHECK: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jnp {{LBB.+_1}} - %1 = fcmp une float %x, 0.000000e+00 - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_eq(i32 %x) { -; CHECK-LABEL: icmp_eq -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp eq i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ne(i32 %x) { -; CHECK-LABEL: icmp_ne -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp ne i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ugt(i32 %x) { -; CHECK-LABEL: icmp_ugt -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp ugt i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_uge(i32 %x) { -; CHECK-LABEL: icmp_uge -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp uge i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ult(i32 %x) { -; CHECK-LABEL: icmp_ult -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp ult i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_ule(i32 %x) { -; CHECK-LABEL: icmp_ule -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp ule i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sgt(i32 %x) { -; CHECK-LABEL: icmp_sgt -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp sgt i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sge(i32 %x) { -; CHECK-LABEL: icmp_sge -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp sge i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_slt(i32 %x) { -; CHECK-LABEL: icmp_slt -; CHECK-NOT: cmpl -; CHECK: movl $1, %eax - %1 = icmp slt i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - -define i32 @icmp_sle(i32 %x) { -; CHECK-LABEL: icmp_sle -; CHECK-NOT: cmpl -; CHECK: xorl %eax, %eax - %1 = icmp sle i32 %x, %x - br i1 %1, label %bb1, label %bb2 -bb2: - ret i32 1 -bb1: - ret i32 0 -} - diff --git a/llvm/test/CodeGen/X86/isel-br.ll b/llvm/test/CodeGen/X86/isel-br.ll new file mode 100644 index 0000000..5388c89 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-br.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -global-isel=0 -verify-machineinstrs | FileCheck %s --check-prefix=DAG +; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=DAG +; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefix=GISEL +; RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -global-isel=0 | FileCheck %s --check-prefix=DAG +; 
RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=DAG +; RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL + +define void @uncondbr() { +; DAG-LABEL: uncondbr: +; DAG: # %bb.0: # %entry +; DAG-NEXT: jmp .LBB0_2 +; DAG-NEXT: .LBB0_1: # %end +; DAG-NEXT: ret{{[l|q]}} +; DAG-NEXT: .LBB0_2: # %bb2 +; DAG-NEXT: jmp .LBB0_1 +; +; GISEL-LABEL: uncondbr: +; GISEL: # %bb.1: # %entry +; GISEL-NEXT: jmp .LBB0_3 +; GISEL-NEXT: .LBB0_2: # %end +; GISEL-NEXT: ret{{[l|q]}} +; GISEL-NEXT: .LBB0_3: # %bb2 +; GISEL-NEXT: jmp .LBB0_2 +entry: + br label %bb2 +end: + ret void +bb2: + br label %end +} diff --git a/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll b/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll new file mode 100644 index 0000000..5a28e09 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -global-isel=0 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=X64,SDAG-X64 +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=X64,FASTISEL-X64 +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=GISEL-X64 + +define i32 @fcmp_oeq(float %x, float %y) { +; X64-LABEL: fcmp_oeq: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB0_1 +; X64-NEXT: jp LBB0_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB0_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oeq: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: setnp %cl +; GISEL-X64-NEXT: andb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB0_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB0_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oeq float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ogt(float %x, float %y) { +; X64-LABEL: fcmp_ogt: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jbe LBB1_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB1_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ogt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB1_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB1_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ogt float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oge(float %x, float %y) { +; X64-LABEL: fcmp_oge: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jb LBB2_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB2_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB2_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; 
GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB2_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oge float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_olt(float %x, float %y) { +; X64-LABEL: fcmp_olt: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jbe LBB3_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB3_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_olt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB3_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB3_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp olt float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ole(float %x, float %y) { +; X64-LABEL: fcmp_ole: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jb LBB4_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB4_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ole: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB4_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB4_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ole float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_one(float %x, float %y) { +; X64-LABEL: fcmp_one: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB5_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB5_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_one: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB5_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB5_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp one float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ord(float %x, float %y) { +; X64-LABEL: fcmp_ord: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jp LBB6_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB6_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ord: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setnp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB6_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB6_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ord float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uno(float %x, float %y) { +; X64-LABEL: fcmp_uno: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jp LBB7_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, 
%eax +; X64-NEXT: retq +; X64-NEXT: LBB7_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uno: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB7_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB7_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uno float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ueq(float %x, float %y) { +; X64-LABEL: fcmp_ueq: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB8_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB8_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ueq: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB8_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB8_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ueq float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ugt(float %x, float %y) { +; X64-LABEL: fcmp_ugt: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jae LBB9_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB9_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ugt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB9_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB9_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ugt float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uge(float %x, float %y) { +; X64-LABEL: fcmp_uge: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: ja LBB10_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB10_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB10_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB10_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uge float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ult(float %x, float %y) { +; X64-LABEL: fcmp_ult: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jae LBB11_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB11_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ult: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB11_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB11_1: ## %bb2 +; GISEL-X64-NEXT: movl 
$1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ult float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ule(float %x, float %y) { +; X64-LABEL: fcmp_ule: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: ja LBB12_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB12_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ule: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB12_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB12_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ule float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_une(float %x, float %y) { +; X64-LABEL: fcmp_une: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB13_2 +; X64-NEXT: jnp LBB13_1 +; X64-NEXT: LBB13_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB13_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_une: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: setp %cl +; GISEL-X64-NEXT: orb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB13_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB13_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp une float %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oeq1(float %x) { +; X64-LABEL: fcmp_oeq1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB14_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB14_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oeq1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: setnp %cl +; GISEL-X64-NEXT: andb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB14_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB14_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oeq float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oeq2(float %x) { +; X64-LABEL: fcmp_oeq2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB15_1 +; X64-NEXT: jp LBB15_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB15_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oeq2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: setnp %cl +; GISEL-X64-NEXT: andb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB15_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB15_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oeq float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: 
+ ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ogt1(float %x) { +; SDAG-X64-LABEL: fcmp_ogt1: +; SDAG-X64: ## %bb.0: +; SDAG-X64-NEXT: xorl %eax, %eax +; SDAG-X64-NEXT: testb %al, %al +; SDAG-X64-NEXT: je LBB16_1 +; SDAG-X64-NEXT: ## %bb.2: ## %bb1 +; SDAG-X64-NEXT: xorl %eax, %eax +; SDAG-X64-NEXT: retq +; SDAG-X64-NEXT: LBB16_1: ## %bb2 +; SDAG-X64-NEXT: movl $1, %eax +; SDAG-X64-NEXT: retq + +; FASTISEL-X64-LABEL: fcmp_ogt1: +; FASTISEL-X64: ## %bb.0: +; FASTISEL-X64: movl $1, %eax +; FASTISEL-X64: retq + +; GISEL-X64-LABEL: fcmp_ogt1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB16_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB16_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ogt float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ogt2(float %x) { +; X64-LABEL: fcmp_ogt2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jbe LBB17_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB17_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ogt2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB17_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB17_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ogt float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oge1(float %x) { +; X64-LABEL: fcmp_oge1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB18_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB18_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oge1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB18_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB18_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oge float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_oge2(float %x) { +; X64-LABEL: fcmp_oge2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jb LBB19_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB19_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_oge2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB19_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB19_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp oge float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define 
i32 @fcmp_olt1(float %x) { +; GISEL-X64-LABEL: fcmp_olt1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB20_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB20_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp olt float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_olt2(float %x) { +; X64-LABEL: fcmp_olt2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jbe LBB21_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB21_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_olt2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB21_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB21_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp olt float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ole1(float %x) { +; X64-LABEL: fcmp_ole1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB22_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB22_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ole1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB22_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB22_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ole float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ole2(float %x) { +; X64-LABEL: fcmp_ole2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jb LBB23_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB23_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ole2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB23_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB23_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ole float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_one1(float %x) { +; GISEL-X64-LABEL: fcmp_one1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB24_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB24_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp one float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + 
ret i32 0 +} + +define i32 @fcmp_one2(float %x) { +; X64-LABEL: fcmp_one2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB25_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB25_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_one2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB25_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB25_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp one float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ord1(float %x) { +; X64-LABEL: fcmp_ord1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB26_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB26_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ord1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setnp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB26_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB26_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ord float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ord2(float %x) { +; X64-LABEL: fcmp_ord2: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB27_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB27_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ord2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setnp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB27_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB27_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ord float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uno1(float %x) { +; X64-LABEL: fcmp_uno1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB28_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB28_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uno1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB28_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB28_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uno float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uno2(float %x) { +; X64-LABEL: fcmp_uno2: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jp LBB29_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB29_2: ## %bb1 +; 
X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uno2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setp %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB29_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB29_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uno float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ueq1(float %x) { +; GISEL-X64-LABEL: fcmp_ueq1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB30_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB30_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ueq float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ueq2(float %x) { +; X64-LABEL: fcmp_ueq2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: je LBB31_2 +; X64-NEXT: ## %bb.1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; X64-NEXT: LBB31_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ueq2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: jne LBB31_2 +; GISEL-X64-NEXT: ## %bb.1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB31_2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ueq float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ugt1(float %x) { +; X64-LABEL: fcmp_ugt1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jnp LBB32_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB32_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ugt1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB32_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB32_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ugt float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ugt2(float %x) { +; X64-LABEL: fcmp_ugt2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: jae LBB33_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB33_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ugt2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB33_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB33_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq 
+ %1 = fcmp ugt float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uge1(float %x) { +; GISEL-X64-LABEL: fcmp_uge1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB34_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB34_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uge float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_uge2(float %x) { +; X64-LABEL: fcmp_uge2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm0, %xmm1 +; X64-NEXT: ja LBB35_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB35_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_uge2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB35_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB35_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp uge float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ult1(float %x) { +; X64-LABEL: fcmp_ult1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jnp LBB36_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB36_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ult1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB36_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB36_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ult float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ult2(float %x) { +; X64-LABEL: fcmp_ult2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jae LBB37_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB37_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ult2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB37_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB37_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ult float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ule1(float %x) { +; GISEL-X64-LABEL: fcmp_ule1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB38_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB38_1: ## %bb2 +; GISEL-X64-NEXT: movl 
$1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ule float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_ule2(float %x) { +; X64-LABEL: fcmp_ule2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: ja LBB39_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB39_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_ule2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB39_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB39_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp ule float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_une1(float %x) { +; X64-LABEL: fcmp_une1: +; X64: ## %bb.0: +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: jnp LBB40_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB40_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_une1: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: setp %cl +; GISEL-X64-NEXT: orb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB40_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB40_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp une float %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @fcmp_une2(float %x) { +; X64-LABEL: fcmp_une2: +; X64: ## %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: jne LBB41_2 +; X64-NEXT: jnp LBB41_1 +; X64-NEXT: LBB41_2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB41_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: fcmp_une2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0 +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: setp %cl +; GISEL-X64-NEXT: orb %al, %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: je LBB41_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB41_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq + %1 = fcmp une float %x, 0.000000e+00 + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} diff --git a/llvm/test/CodeGen/X86/isel-brcond-icmp.ll b/llvm/test/CodeGen/X86/isel-brcond-icmp.ll new file mode 100644 index 0000000..59a45d9 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-brcond-icmp.ll @@ -0,0 +1,1107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -global-isel=0 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X64,SDAG +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X64,FASTISEL +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: 
llc < %s -global-isel=0 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X86,SDAG +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X86,FASTISEL +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X86 + +define i32 @icmp_eq_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_eq_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jne LBB0_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB0_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_eq_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB0_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB0_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_eq_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jne LBB0_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB0_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_eq_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: sete %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB0_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB0_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp eq i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ne_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ne_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: je LBB1_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB1_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ne_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB1_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB1_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ne_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: je LBB1_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB1_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ne_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setne %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB1_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB1_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ne i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ugt_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ugt_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, 
%edi +; X64-NEXT: jbe LBB2_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB2_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ugt_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB2_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB2_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ugt_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jbe LBB2_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB2_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ugt_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: seta %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB2_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB2_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ugt i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_uge_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_uge_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jb LBB3_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB3_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_uge_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB3_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB3_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_uge_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jb LBB3_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB3_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_uge_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setae %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB3_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB3_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp uge i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ult_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ult_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jae LBB4_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB4_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ult_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB4_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB4_1: ## %bb2 +; 
GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ult_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jae LBB4_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB4_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ult_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setb %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB4_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB4_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ult i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ule_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_ule_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: ja LBB5_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB5_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_ule_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB5_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB5_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_ule_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: ja LBB5_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB5_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_ule_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setbe %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB5_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB5_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ule i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sgt_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_sgt_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jle LBB6_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB6_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_sgt_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setg %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB6_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB6_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_sgt_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jle LBB6_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB6_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_sgt_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setg %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB6_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB6_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sgt i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sge_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_sge_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jl LBB7_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB7_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_sge_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setge %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB7_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB7_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_sge_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jl LBB7_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB7_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_sge_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setge %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB7_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB7_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sge i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_slt_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_slt_2: +; X64: ## %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jge LBB8_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB8_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_slt_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setl %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB8_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB8_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_slt_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jge LBB8_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB8_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_slt_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setl %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB8_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB8_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp slt i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sle_2(i32 %x, i32 %y) { +; X64-LABEL: icmp_sle_2: +; X64: ## 
%bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: jg LBB9_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB9_1: ## %bb2 +; X64-NEXT: movl $1, %eax +; X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_sle_2: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setle %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB9_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB9_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; X86-LABEL: icmp_sle_2: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}} +; X86-NEXT: jg LBB9_1 +; X86-NEXT: ## %bb.2: ## %bb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: LBB9_1: ## %bb2 +; X86-NEXT: movl $1, %eax +; X86-NEXT: retl +; +; GISEL-X86-LABEL: icmp_sle_2: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setle %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB9_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB9_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sle i32 %x, %y + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_eq(i32 %x) { +; SDAG-LABEL: icmp_eq: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB10_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB10_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_eq: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_eq: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: sete %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB10_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB10_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_eq: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: sete %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB10_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB10_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp eq i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ne(i32 %x) { +; SDAG-LABEL: icmp_ne: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB11_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB11_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_ne: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ne: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB11_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB11_1: ## %bb2 +; 
GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ne: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setne %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB11_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB11_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ne i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ugt(i32 %x) { +; SDAG-LABEL: icmp_ugt: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB12_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB12_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_ugt: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ugt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: seta %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB12_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB12_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ugt: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: seta %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB12_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB12_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ugt i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_uge(i32 %x) { +; SDAG-LABEL: icmp_uge: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB13_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB13_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-X64-LABEL: icmp_uge: +; FASTISEL-X64: ## %bb.0: +; FASTISEL-X64-NEXT: xorl %eax, %eax +; FASTISEL-X64-NEXT: retq +; +; GISEL-X64-LABEL: icmp_uge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setae %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB13_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB13_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_uge: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setae %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB13_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB13_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp uge i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ult(i32 %x) { +; SDAG-LABEL: icmp_ult: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB14_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB14_1: ## %bb2 +; 
SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-X64-LABEL: icmp_ult: +; FASTISEL-X64: ## %bb.0: +; FASTISEL-X64-NEXT: movl $1, %eax +; FASTISEL-X64-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ult: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setb %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB14_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB14_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ult: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setb %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB14_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB14_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ult i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_ule(i32 %x) { +; SDAG-LABEL: icmp_ule: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB15_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB15_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_ule: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_ule: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setbe %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB15_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB15_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_ule: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setbe %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB15_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB15_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp ule i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sgt(i32 %x) { +; SDAG-LABEL: icmp_sgt: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB16_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB16_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_sgt: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_sgt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setg %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB16_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB16_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_sgt: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setg %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB16_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; 
GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB16_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sgt i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sge(i32 %x) { +; SDAG-LABEL: icmp_sge: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB17_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB17_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_sge: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_sge: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setge %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB17_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB17_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_sge: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setge %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB17_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB17_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sge i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_slt(i32 %x) { +; SDAG-LABEL: icmp_slt: +; SDAG: ## %bb.0: +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB18_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB18_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_slt: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: movl $1, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_slt: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setl %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB18_1 +; GISEL-X64-NEXT: ## %bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB18_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_slt: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setl %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB18_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB18_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp slt i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} + +define i32 @icmp_sle(i32 %x) { +; SDAG-LABEL: icmp_sle: +; SDAG: ## %bb.0: +; SDAG-NEXT: movb $1, %al +; SDAG-NEXT: testb %al, %al +; SDAG-NEXT: je LBB19_1 +; SDAG-NEXT: ## %bb.2: ## %bb1 +; SDAG-NEXT: xorl %eax, %eax +; SDAG-NEXT: ret{{q|l}} +; SDAG-NEXT: LBB19_1: ## %bb2 +; SDAG-NEXT: movl $1, %eax +; SDAG-NEXT: ret{{q|l}} +; +; FASTISEL-LABEL: icmp_sle: +; FASTISEL: ## %bb.0: +; FASTISEL-NEXT: xorl %eax, %eax +; FASTISEL-NEXT: ret{{q|l}} +; +; GISEL-X64-LABEL: icmp_sle: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: cmpl %edi, %edi +; GISEL-X64-NEXT: setle %al +; GISEL-X64-NEXT: testb $1, %al +; GISEL-X64-NEXT: je LBB19_1 +; GISEL-X64-NEXT: ## 
%bb.2: ## %bb1 +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: LBB19_1: ## %bb2 +; GISEL-X64-NEXT: movl $1, %eax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: icmp_sle: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: cmpl %eax, %eax +; GISEL-X86-NEXT: setle %al +; GISEL-X86-NEXT: testb $1, %al +; GISEL-X86-NEXT: je LBB19_1 +; GISEL-X86-NEXT: ## %bb.2: ## %bb1 +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: retl +; GISEL-X86-NEXT: LBB19_1: ## %bb2 +; GISEL-X86-NEXT: movl $1, %eax +; GISEL-X86-NEXT: retl + %1 = icmp sle i32 %x, %x + br i1 %1, label %bb1, label %bb2 +bb2: + ret i32 1 +bb1: + ret i32 0 +} -- cgit v1.1 From b85fe40cb88a6b4f640c2b757bd0d254ff1d032c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Thu, 8 Feb 2024 11:09:57 +0100 Subject: [clang][analyzer] Add missing stream related functions to StdLibraryFunctionsChecker. (#76979) Some stream functions were recently added to `StreamChecker` that were not modeled by `StdCLibraryFunctionsChecker`. To ensure consistency these functions are added to the other checker too. Some of the related tests are re-organized. --- .../Checkers/StdLibraryFunctionsChecker.cpp | 79 ++++++++++++-- .../Inputs/std-c-library-functions-POSIX.h | 15 ++- .../test/Analysis/std-c-library-functions-POSIX.c | 16 ++- clang/test/Analysis/std-c-library-functions.c | 4 +- clang/test/Analysis/stream-error.c | 26 ----- clang/test/Analysis/stream-noopen.c | 120 +++++++++++++++++---- clang/test/Analysis/stream.c | 25 ++++- 7 files changed, 221 insertions(+), 64 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 0c6293e6..6b8ac26 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -2023,13 +2023,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( {{EOFv, EOFv}, {0, UCharRangeMax}}, "an unsigned char value or EOF"))); - // The getc() family of functions that returns either a char or an EOF. - addToFunctionSummaryMap( - {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), - Summary(NoEvalCall) - .Case({ReturnValueCondition(WithinRange, - {{EOFv, EOFv}, {0, UCharRangeMax}})}, - ErrnoIrrelevant)); addToFunctionSummaryMap( "getchar", Signature(ArgTypes{}, RetType{IntTy}), Summary(NoEvalCall) @@ -2139,7 +2132,17 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( std::move(GetenvSummary)); } - if (ModelPOSIX) { + if (!ModelPOSIX) { + // Without POSIX use of 'errno' is not specified (in these cases). + // Add these functions without 'errno' checks. 
+ addToFunctionSummaryMap( + {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), + Summary(NoEvalCall) + .Case({ReturnValueCondition(WithinRange, + {{EOFv, EOFv}, {0, UCharRangeMax}})}, + ErrnoIrrelevant) + .ArgConstraint(NotNull(ArgNo(0)))); + } else { const auto ReturnsZeroOrMinusOne = ConstraintSet{ReturnValueCondition(WithinRange, Range(-1, 0))}; const auto ReturnsZero = @@ -2231,6 +2234,63 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .Case(ReturnsMinusOne, ErrnoNEZeroIrrelevant, GenericFailureMsg) .ArgConstraint(NotNull(ArgNo(0)))); + std::optional Off_tTy = lookupTy("off_t"); + std::optional Off_tMax = getMaxValue(Off_tTy); + + // int fgetc(FILE *stream); + // 'getc' is the same as 'fgetc' but may be a macro + addToFunctionSummaryMap( + {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), + Summary(NoEvalCall) + .Case({ReturnValueCondition(WithinRange, {{0, UCharRangeMax}})}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, + ErrnoIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(0)))); + + // int fputc(int c, FILE *stream); + // 'putc' is the same as 'fputc' but may be a macro + addToFunctionSummaryMap( + {"putc", "fputc"}, + Signature(ArgTypes{IntTy, FilePtrTy}, RetType{IntTy}), + Summary(NoEvalCall) + .Case({ArgumentCondition(0, WithinRange, Range(0, UCharRangeMax)), + ReturnValueCondition(BO_EQ, ArgNo(0))}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ArgumentCondition(0, OutOfRange, Range(0, UCharRangeMax)), + ReturnValueCondition(WithinRange, Range(0, UCharRangeMax))}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, + ErrnoNEZeroIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(1)))); + + // char *fgets(char *restrict s, int n, FILE *restrict stream); + addToFunctionSummaryMap( + "fgets", + Signature(ArgTypes{CharPtrRestrictTy, IntTy, FilePtrRestrictTy}, + RetType{CharPtrTy}), + Summary(NoEvalCall) + .Case({ReturnValueCondition(BO_EQ, ArgNo(0))}, + ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({IsNull(Ret)}, ErrnoIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(0))) + .ArgConstraint(ArgumentCondition(1, WithinRange, Range(0, IntMax))) + .ArgConstraint( + BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1))) + .ArgConstraint(NotNull(ArgNo(2)))); + + // int fputs(const char *restrict s, FILE *restrict stream); + addToFunctionSummaryMap( + "fputs", + Signature(ArgTypes{ConstCharPtrRestrictTy, FilePtrRestrictTy}, + RetType{IntTy}), + Summary(NoEvalCall) + .Case(ReturnsNonnegative, ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, + ErrnoNEZeroIrrelevant, GenericFailureMsg) + .ArgConstraint(NotNull(ArgNo(0))) + .ArgConstraint(NotNull(ArgNo(1)))); + // int ungetc(int c, FILE *stream); addToFunctionSummaryMap( "ungetc", Signature(ArgTypes{IntTy, FilePtrTy}, RetType{IntTy}), @@ -2250,9 +2310,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( 0, WithinRange, {{EOFv, EOFv}, {0, UCharRangeMax}})) .ArgConstraint(NotNull(ArgNo(1)))); - std::optional Off_tTy = lookupTy("off_t"); - std::optional Off_tMax = getMaxValue(Off_tTy); - // int fseek(FILE *stream, long offset, int whence); // FIXME: It can be possible to get the 'SEEK_' values (like EOFv) and use // these for condition of arg 2. 
diff --git a/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h b/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h index 63e22eb..b146068 100644 --- a/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h +++ b/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h @@ -11,6 +11,7 @@ typedef unsigned long int pthread_t; typedef unsigned long time_t; typedef unsigned long clockid_t; typedef __INT64_TYPE__ off64_t; +typedef __INT64_TYPE__ fpos_t; typedef struct { int a; @@ -42,9 +43,22 @@ FILE *fopen(const char *restrict pathname, const char *restrict mode); FILE *tmpfile(void); FILE *freopen(const char *restrict pathname, const char *restrict mode, FILE *restrict stream); +FILE *fdopen(int fd, const char *mode); int fclose(FILE *stream); +int putc(int c, FILE *stream); +int fputc(int c, FILE *stream); +char *fgets(char *restrict s, int n, FILE *restrict stream); +int fputs(const char *restrict s, FILE *restrict stream); int fseek(FILE *stream, long offset, int whence); +int fgetpos(FILE *restrict stream, fpos_t *restrict pos); +int fsetpos(FILE *stream, const fpos_t *pos); +int fflush(FILE *stream); +long ftell(FILE *stream); int fileno(FILE *stream); +void rewind(FILE *stream); +void clearerr(FILE *stream); +int feof(FILE *stream); +int ferror(FILE *stream); long a64l(const char *str64); char *l64a(long value); int open(const char *path, int oflag, ...); @@ -100,7 +114,6 @@ int pclose(FILE *stream); int close(int fildes); long fpathconf(int fildes, int name); long pathconf(const char *path, int name); -FILE *fdopen(int fd, const char *mode); void rewinddir(DIR *dir); void seekdir(DIR *dirp, long loc); int rand_r(unsigned int *seedp); diff --git a/clang/test/Analysis/std-c-library-functions-POSIX.c b/clang/test/Analysis/std-c-library-functions-POSIX.c index 03aa8e2..b53f313 100644 --- a/clang/test/Analysis/std-c-library-functions-POSIX.c +++ b/clang/test/Analysis/std-c-library-functions-POSIX.c @@ -23,10 +23,22 @@ // CHECK: Loaded summary for: FILE *popen(const char *command, const char *type) // CHECK: Loaded summary for: int fclose(FILE *stream) // CHECK: Loaded summary for: int pclose(FILE *stream) +// CHECK: Loaded summary for: int getc(FILE *) +// CHECK: Loaded summary for: int fgetc(FILE *) +// CHECK: Loaded summary for: int putc(int c, FILE *stream) +// CHECK: Loaded summary for: int fputc(int c, FILE *stream) +// CHECK: Loaded summary for: char *fgets(char *restrict s, int n, FILE *restrict stream) +// CHECK: Loaded summary for: int fputs(const char *restrict s, FILE *restrict stream) // CHECK: Loaded summary for: int fseek(FILE *stream, long offset, int whence) -// CHECK: Loaded summary for: int fseeko(FILE *stream, off_t offset, int whence) -// CHECK: Loaded summary for: off_t ftello(FILE *stream) +// CHECK: Loaded summary for: int fgetpos(FILE *restrict stream, fpos_t *restrict pos) +// CHECK: Loaded summary for: int fsetpos(FILE *stream, const fpos_t *pos) +// CHECK: Loaded summary for: int fflush(FILE *stream) +// CHECK: Loaded summary for: long ftell(FILE *stream) // CHECK: Loaded summary for: int fileno(FILE *stream) +// CHECK: Loaded summary for: void rewind(FILE *stream) +// CHECK: Loaded summary for: void clearerr(FILE *stream) +// CHECK: Loaded summary for: int feof(FILE *stream) +// CHECK: Loaded summary for: int ferror(FILE *stream) // CHECK: Loaded summary for: long a64l(const char *str64) // CHECK: Loaded summary for: char *l64a(long value) // CHECK: Loaded summary for: int open(const char *path, int oflag, ...) 
diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index b7eb6b2..e6564e2 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -53,8 +53,6 @@ // CHECK-NEXT: Loaded summary for: int toupper(int) // CHECK-NEXT: Loaded summary for: int tolower(int) // CHECK-NEXT: Loaded summary for: int toascii(int) -// CHECK-NEXT: Loaded summary for: int getc(FILE *) -// CHECK-NEXT: Loaded summary for: int fgetc(FILE *) // CHECK-NEXT: Loaded summary for: int getchar(void) // CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict) @@ -63,6 +61,8 @@ // CHECK-NEXT: Loaded summary for: ssize_t getline(char **restrict, size_t *restrict, FILE *restrict) // CHECK-NEXT: Loaded summary for: ssize_t getdelim(char **restrict, size_t *restrict, int, FILE *restrict) // CHECK-NEXT: Loaded summary for: char *getenv(const char *) +// CHECK-NEXT: Loaded summary for: int getc(FILE *) +// CHECK-NEXT: Loaded summary for: int fgetc(FILE *) #include "Inputs/std-c-library-functions.h" diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c index cd4b009..4bab075 100644 --- a/clang/test/Analysis/stream-error.c +++ b/clang/test/Analysis/stream-error.c @@ -491,32 +491,6 @@ void error_ftello(void) { fclose(F); } -void error_fflush_after_fclose(void) { - FILE *F = tmpfile(); - int Ret; - fflush(NULL); // no-warning - if (!F) - return; - if ((Ret = fflush(F)) != 0) - clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} - fclose(F); - fflush(F); // expected-warning {{Stream might be already closed}} -} - -void error_fflush_on_open_failed_stream(void) { - FILE *F = tmpfile(); - if (!F) { - fflush(F); // no-warning - return; - } - fclose(F); -} - -void error_fflush_on_unknown_stream(FILE *F) { - fflush(F); // no-warning - fclose(F); // no-warning -} - void error_fflush_on_non_null_stream_clear_error_states(void) { FILE *F0 = tmpfile(), *F1 = tmpfile(); // `fflush` clears a non-EOF stream's error state. 
diff --git a/clang/test/Analysis/stream-noopen.c b/clang/test/Analysis/stream-noopen.c index 8ad101e..8bd01a9 100644 --- a/clang/test/Analysis/stream-noopen.c +++ b/clang/test/Analysis/stream-noopen.c @@ -57,6 +57,95 @@ void test_fwrite(FILE *F) { clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} } +void test_fgetc(FILE *F) { + int Ret = fgetc(F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + if (Ret != EOF) { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + // expected-warning@-1 {{FALSE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_fputc(FILE *F) { + int Ret = fputc('a', F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + if (Ret != EOF) { + clang_analyzer_eval(Ret == 'a'); // expected-warning {{TRUE}} + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_fgets(char *Buf, int N, FILE *F) { + char *Ret = fgets(Buf, N, F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(Buf != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(N >= 0); // expected-warning {{TRUE}} + if (Ret == Buf) { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(Ret == 0); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + // expected-warning@-1 {{FALSE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} + + char Buf1[10]; + Ret = fgets(Buf1, 11, F); // expected-warning {{The 1st argument to 'fgets' is a buffer with size 10}} +} + +void test_fgets_bufsize(FILE *F) { + char Buf[10]; + fgets(Buf, 11, F); // expected-warning {{The 1st argument to 'fgets' is a buffer with size 10}} +} + +void test_fputs(char *Buf, FILE *F) { + int Ret = fputs(Buf, F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(Buf != NULL); // expected-warning {{TRUE}} + if (Ret >= 0) { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_ungetc(FILE *F) { + int Ret = ungetc('X', F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + if (Ret == 'X') { + if (errno) {} // expected-warning {{undefined}} + } else { + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + } + clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} +} + +void test_ungetc_EOF(FILE *F, int C) { + int Ret = ungetc(EOF, F); + clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} + Ret = ungetc(C, F); + if (Ret == EOF) { + clang_analyzer_eval(C == EOF); // expected-warning {{TRUE}} + // expected-warning@-1{{FALSE}} + } +} + 
void test_fclose(FILE *F) { int Ret = fclose(F); clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} @@ -138,28 +227,17 @@ void test_rewind(FILE *F) { rewind(F); } -void test_ungetc(FILE *F) { - int Ret = ungetc('X', F); - clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} - if (Ret == 'X') { - if (errno) {} // expected-warning {{undefined}} - } else { - clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} - clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} - } - clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}} -} - -void test_ungetc_EOF(FILE *F, int C) { - int Ret = ungetc(EOF, F); - clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}} - clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} - clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}} - Ret = ungetc(C, F); +void test_fflush(FILE *F) { + errno = 0; + int Ret = fflush(F); + clang_analyzer_eval(F != NULL); // expected-warning{{TRUE}} + // expected-warning@-1{{FALSE}} if (Ret == EOF) { - clang_analyzer_eval(C == EOF); // expected-warning {{TRUE}} - // expected-warning@-1{{FALSE}} + clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} + } else { + clang_analyzer_eval(Ret == 0); // expected-warning{{TRUE}} + clang_analyzer_eval(errno == 0); // expected-warning{{TRUE}} + // expected-warning@-1{{FALSE}} } } diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index 36a9b4e..378c915 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -1,7 +1,9 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection -verify %s #include "Inputs/system-header-simulator.h" +void clang_analyzer_eval(int); + void check_fread(void) { FILE *fp = tmpfile(); fread(0, 0, 0, fp); // expected-warning {{Stream pointer might be NULL}} @@ -316,3 +318,24 @@ void check_leak_noreturn_2(void) { } // expected-warning {{Opened stream never closed. Potential resource leak}} // FIXME: This warning should be placed at the `return` above. // See https://reviews.llvm.org/D83120 about details. + +void fflush_after_fclose(void) { + FILE *F = tmpfile(); + int Ret; + fflush(NULL); // no-warning + if (!F) + return; + if ((Ret = fflush(F)) != 0) + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + fclose(F); + fflush(F); // expected-warning {{Stream might be already closed}} +} + +void fflush_on_open_failed_stream(void) { + FILE *F = tmpfile(); + if (!F) { + fflush(F); // no-warning + return; + } + fclose(F); +} -- cgit v1.1 From 8f2378d7fcf19ea00fbd3366c2125569ef084f93 Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Thu, 8 Feb 2024 11:27:08 +0100 Subject: [mlir][EmitC] Add builders for call_opaque op (#80879) This allows omitting the default-valued attributes and therefore writing more compact code.
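For illustration only (a hypothetical C++ call site, not part of this patch; the callee name "load_data" is invented), the new builder lets clients leave out the two attribute parameters when they are empty:

    // `args` and `template_args` now default to empty ArrayAttrs and can be
    // omitted at the call site:
    builder.create<emitc::CallOpaqueOp>(loc, resultTypes, "load_data", operands);

Previously both ArrayAttrs had to be passed explicitly even when unused.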
--- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index 39cc360..c50fdf3 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -122,6 +122,19 @@ def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", []> { Variadic:$operands ); let results = (outs Variadic); + let builders = [ + OpBuilder<(ins + "::mlir::TypeRange":$resultTypes, + "::llvm::StringRef":$callee, + "::mlir::ValueRange":$operands, + CArg<"::mlir::ArrayAttr", "{}">:$args, + CArg<"::mlir::ArrayAttr", "{}">:$template_args), [{ + build($_builder, $_state, resultTypes, callee, args, template_args, + operands); + }] + > + ]; + let assemblyFormat = [{ $callee `(` $operands `)` attr-dict `:` functional-type($operands, results) }]; -- cgit v1.1 From 1a42b3804f0ed1c4958c4f17216543a1623e3452 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 10:27:34 +0000 Subject: [DebugInfo][RemoveDIs] Erase ranges of instructions individually (#81007) The BasicBlock::erase method simply removes a range of instructions from the instlist by unlinking them. However, now that we're attaching debug-info directly to instructions, some cleanup is required, so use eraseFromParent on each instruction instead. This is less efficient, but rare, and seemingly only WASM EH Prepare uses this method of BasicBlock. Detected via a memory leak check in asan. (asan is always the final boss for whatever I do). --- llvm/lib/IR/BasicBlock.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index bb55f48..fe9d0d0 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -677,7 +677,9 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) { BasicBlock::iterator BasicBlock::erase(BasicBlock::iterator FromIt, BasicBlock::iterator ToIt) { - return InstList.erase(FromIt, ToIt); + for (Instruction &I : make_early_inc_range(make_range(FromIt, ToIt))) + I.eraseFromParent(); + return ToIt; } void BasicBlock::replacePhiUsesWith(BasicBlock *Old, BasicBlock *New) { -- cgit v1.1 From faa2f9658a0cd276f3415fad2676f8d90df51268 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 10:44:43 +0000 Subject: [DebugInfo] Handle dbg.assigns in FastISel (#80734) There are some rare circumstances where dbg.assign intrinsics can reach FastISel. They are a more specialised kind of dbg.value intrinsic with more information about the originating alloca. They only occur during optimisation, but might reach FastISel through always_inlining an optimised function into an optnone function. This is a slight problem as it's not safe (for debug-info accuracy) to ignore any intrinsics, and for RemoveDIs (the intrinsic-replacement project) it causes a crash through an unhandled switch case. To get around this, we can just treat the dbg.assign as a dbg.value (it's an actual subclass) and use the variable location information from the dbg.value fields. This loses a small amount of debug-info about stack locations, but is more accurate than just ignoring the intrinsic. (This has popped up deep in an LTO build of a large codebase while testing RemoveDIs, I figured it'd be good to fix it for the intrinsic-form at the same time, just to demonstrate the correct behaviour). 
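As a rough sketch of the scenario described above (hypothetical code, not taken from this patch):

    // Under LTO, `callee` may already have been optimised (so, with -g, its
    // variable locations can be tracked by dbg.assign intrinsics) before it
    // is always_inlined into an optnone caller, whose body is then selected
    // by FastISel.
    __attribute__((always_inline)) static inline int callee(int x) {
      int local = x + 1;
      return local;
    }
    __attribute__((optnone)) int caller(int x) { return callee(x); }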
--- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 7 ++++ llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 10 ++++- .../X86/dont-drop-dbg-assigns-in-isels.ll | 46 ++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index dd38317..c1d8e89 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2120,6 +2120,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, ListSize, Alignment)); return true; } + case Intrinsic::dbg_assign: + // A dbg.assign is a dbg.value with more information about stack locations, + // typically produced during optimisation of variables with leaked + // addresses. We can treat it like a normal dbg_value intrinsic here; to + // benefit from the full analysis of stack/SSA locations, GlobalISel would + // need to register for and use the AssignmentTrackingAnalysis pass. + LLVM_FALLTHROUGH; case Intrinsic::dbg_value: { // This form of DBG_VALUE is target-independent. const DbgValueInst &DI = cast(CI); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 4df79f4..f875652 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1197,7 +1197,8 @@ void FastISel::handleDbgInfo(const Instruction *II) { V = DPV.getVariableLocationOp(0); bool Res = false; - if (DPV.getType() == DPValue::LocationType::Value) { + if (DPV.getType() == DPValue::LocationType::Value || + DPV.getType() == DPValue::LocationType::Assign) { Res = lowerDbgValue(V, DPV.getExpression(), DPV.getVariable(), DPV.getDebugLoc()); } else { @@ -1393,6 +1394,13 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { return true; } + case Intrinsic::dbg_assign: + // A dbg.assign is a dbg.value with more information, typically produced + // during optimisation. If one reaches fastisel then something odd has + // happened (such as an optimised function being always-inlined into an + // optnone function). We will not be using the extra information in the + // dbg.assign in that case, just use its dbg.value fields. + LLVM_FALLTHROUGH; case Intrinsic::dbg_value: { // This form of DBG_VALUE is target-independent. 
const DbgValueInst *DI = cast<DbgValueInst>(II); diff --git a/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll b/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll new file mode 100644 index 0000000..77c9aa5 --- /dev/null +++ b/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll @@ -0,0 +1,46 @@ +; RUN: llc %s -fast-isel -start-after=codegenprepare -stop-before=finalize-isel -o - | FileCheck %s +; RUN: llc %s -fast-isel -start-after=codegenprepare -stop-before=finalize-isel -o - --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: llc %s -global-isel -start-after=codegenprepare -stop-before=finalize-isel -o - | FileCheck %s +; RUN: llc %s -global-isel -start-after=codegenprepare -stop-before=finalize-isel -o - --try-experimental-debuginfo-iterators | FileCheck %s + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; CHECK: DBG_VALUE + +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +define dso_local i32 @foo(i32 %a, i32 %b) local_unnamed_addr !dbg !8 { +entry: + call void @llvm.dbg.assign(metadata !DIArgList(i32 %a, i32 %b), metadata !16, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), metadata !21, metadata ptr undef, metadata !DIExpression()), !dbg !17 + %mul = mul nsw i32 %b, %a, !dbg !18 + ret i32 %mul, !dbg !18 +} + + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !19, !6} +!llvm.ident = !{!7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "debug_value_list_selectiondag.cpp", directory: "/") +!2 = !{} +!3 = !{i32 2, !"CodeView", i32 1} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 2} +!6 = !{i32 7, !"PIC Level", i32 2} +!7 = !{!"clang version 11.0.0"} +!8 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !9, file: !9, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13) +!9 = !DIFile(filename: ".\\debug_value_list.cpp", directory: "/tmp") +!10 = !DISubroutineType(types: !11) +!11 = !{!12, !12, !12} +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !{!14, !15, !16} +!14 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !9, line: 1, type: !12) +!15 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !9, line: 1, type: !12) +!16 = !DILocalVariable(name: "c", scope: !8, file: !9, line: 2, type: !12) +!17 = !DILocation(line: 0, scope: !8) +!18 = !DILocation(line: 3, scope: !8) +!19 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!20 = !DILocalVariable(name: "d", scope: !8, file: !9, line: 2, type: !12) +!21 = distinct !DIAssignID() -- cgit v1.1 From 878234b3202c9fe343cd59c71b50c4c4c5dc1b8c Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 8 Feb 2024 11:07:33 +0000 Subject: [BasicAA] Scalable offset with scalable typesize. (#80818) This patch adds a simple alias analysis check for accesses that are scalable with an offset between them that is also trivially scalable (there are no other constant/variable offsets). We essentially divide each side by vscale and are left needing to check that the offset >= typesize.
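As a worked example, a sketch in the spirit of the vscale.ll cases updated below (adapted, not an exact excerpt):

    %v = call i64 @llvm.vscale.i64()
    %neg = mul nsw i64 %v, -16                 ; offset of -16 * vscale bytes
    %vm16 = getelementptr i8, ptr %p, i64 %neg
    ; A <vscale x 4 x i32> access is 16 * vscale bytes wide. Dividing both the
    ; offset and the type size by vscale leaves |-16| >= 16, so an access at
    ; %vm16 ends exactly where one at %p begins and the two cannot overlap:
    ; NoAlias.

Since vscale >= 1, the same conclusion holds when only the offset is scalable, provided the dependency distance is at least the type size.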
--- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 21 +++++++++++++++++++++ llvm/test/Analysis/BasicAA/vscale.ll | 22 +++++++++++----------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 19c4393..ae31814 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1170,6 +1170,27 @@ AliasResult BasicAAResult::aliasGEP( } } + // VScale Alias Analysis - Given one scalable offset between accesses and a + // scalable typesize, we can divide each side by vscale, treating both values + // as a constant. We prove that Offset/vscale >= TypeSize/vscale. + if (DecompGEP1.VarIndices.size() == 1 && DecompGEP1.VarIndices[0].IsNSW && + DecompGEP1.VarIndices[0].Val.TruncBits == 0 && + DecompGEP1.Offset.isZero() && + PatternMatch::match(DecompGEP1.VarIndices[0].Val.V, + PatternMatch::m_VScale())) { + const VariableGEPIndex &ScalableVar = DecompGEP1.VarIndices[0]; + APInt Scale = + ScalableVar.IsNegated ? -ScalableVar.Scale : ScalableVar.Scale; + LocationSize VLeftSize = Scale.isNegative() ? V1Size : V2Size; + + // Note that we do not check that the typesize is scalable, as vscale >= 1 + // so noalias still holds so long as the dependency distance is at least as + // big as the typesize. + if (VLeftSize.hasValue() && + Scale.uge(VLeftSize.getValue().getKnownMinValue())) + return AliasResult::NoAlias; + } + // Bail on analysing scalable LocationSize if (V1Size.isScalable() || V2Size.isScalable()) return AliasResult::MayAlias; diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index 1b9118b..ce0c6f1 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -339,15 +339,15 @@ define void @vscale_neg_notscalable(ptr %p) { } ; CHECK-LABEL: vscale_neg_scalable -; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: NoAlias: * %p, * %vm16 ; CHECK-DAG: MayAlias: * %m16, * %p ; CHECK-DAG: MayAlias: * %m16, * %vm16 ; CHECK-DAG: MayAlias: * %p, * %vm16m16 ; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 -; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: NoAlias: * %m16, * %vm16m16 ; CHECK-DAG: MayAlias: * %m16pv16, * %p ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 -; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: NoAlias: * %m16, * %m16pv16 ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 define void @vscale_neg_scalable(ptr %p) { %v = call i64 @llvm.vscale.i64() @@ -393,15 +393,15 @@ define void @vscale_pos_notscalable(ptr %p) { } ; CHECK-LABEL: vscale_pos_scalable -; CHECK-DAG: MayAlias: * %p, * %vm16 +; CHECK-DAG: NoAlias: * %p, * %vm16 ; CHECK-DAG: MayAlias: * %m16, * %p ; CHECK-DAG: MayAlias: * %m16, * %vm16 ; CHECK-DAG: MayAlias: * %p, * %vm16m16 ; CHECK-DAG: MayAlias: * %vm16, * %vm16m16 -; CHECK-DAG: MayAlias: * %m16, * %vm16m16 +; CHECK-DAG: NoAlias: * %m16, * %vm16m16 ; CHECK-DAG: MayAlias: * %m16pv16, * %p ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16 -; CHECK-DAG: MayAlias: * %m16, * %m16pv16 +; CHECK-DAG: NoAlias: * %m16, * %m16pv16 ; CHECK-DAG: MayAlias: * %m16pv16, * %vm16m16 define void @vscale_pos_scalable(ptr %p) { %v = call i64 @llvm.vscale.i64() @@ -421,9 +421,9 @@ define void @vscale_pos_scalable(ptr %p) { ; CHECK-LABEL: vscale_v1v2types ; CHECK-DAG: MustAlias: <4 x i32>* %p, * %p -; CHECK-DAG: MayAlias: * %p, * %vm16 -; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vm16 -; CHECK-DAG: MayAlias: * %p, <4 x i32>* %vm16 +; CHECK-DAG: NoAlias: * %p, * %vm16 +; CHECK-DAG: NoAlias: <4 x i32>* 
%p, * %vm16 +; CHECK-DAG: NoAlias: * %p, <4 x i32>* %vm16 ; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16 ; CHECK-DAG: MustAlias: <4 x i32>* %vm16, * %vm16 ; CHECK-DAG: MayAlias: * %m16, * %p @@ -435,8 +435,8 @@ define void @vscale_pos_scalable(ptr %p) { ; CHECK-DAG: MayAlias: <4 x i32>* %m16, * %vm16 ; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16 ; CHECK-DAG: MustAlias: <4 x i32>* %m16, * %m16 -; CHECK-DAG: MayAlias: * %p, * %vp16 -; CHECK-DAG: MayAlias: <4 x i32>* %p, * %vp16 +; CHECK-DAG: NoAlias: * %p, * %vp16 +; CHECK-DAG: NoAlias: <4 x i32>* %p, * %vp16 ; CHECK-DAG: MayAlias: * %vm16, * %vp16 ; CHECK-DAG: MayAlias: <4 x i32>* %vm16, * %vp16 ; CHECK-DAG: MayAlias: * %m16, * %vp16 -- cgit v1.1 From 455c3966cd7305b40d6941b544a16c22120b4512 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Thu, 8 Feb 2024 11:07:01 +0000 Subject: [RISCV][test] Add test coverage for RISCVInstrInfo::isCopyInstrImpl --- llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp | 63 ++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp index 5836239..5f3ce53 100644 --- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp +++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp @@ -94,6 +94,69 @@ TEST_P(RISCVInstrInfoTest, IsAddImmediate) { } } +TEST_P(RISCVInstrInfoTest, IsCopyInstrImpl) { + const RISCVInstrInfo *TII = ST->getInstrInfo(); + DebugLoc DL; + + // ADDI. + + MachineInstr *MI1 = BuildMI(*MF, DL, TII->get(RISCV::ADDI), RISCV::X1) + .addReg(RISCV::X2) + .addImm(-128) + .getInstr(); + auto MI1Res = TII->isCopyInstrImpl(*MI1); + EXPECT_FALSE(MI1Res.has_value()); + + MachineInstr *MI2 = BuildMI(*MF, DL, TII->get(RISCV::ADDI), RISCV::X1) + .addReg(RISCV::X2) + .addImm(0) + .getInstr(); + auto MI2Res = TII->isCopyInstrImpl(*MI2); + ASSERT_TRUE(MI2Res.has_value()); + EXPECT_EQ(MI2Res->Destination->getReg(), RISCV::X1); + EXPECT_EQ(MI2Res->Source->getReg(), RISCV::X2); + + // Partial coverage of FSGNJ_* instructions. + + MachineInstr *MI3 = BuildMI(*MF, DL, TII->get(RISCV::FSGNJ_D), RISCV::F1_D) + .addReg(RISCV::F2_D) + .addReg(RISCV::F1_D) + .getInstr(); + auto MI3Res = TII->isCopyInstrImpl(*MI3); + EXPECT_FALSE(MI3Res.has_value()); + + MachineInstr *MI4 = BuildMI(*MF, DL, TII->get(RISCV::FSGNJ_D), RISCV::F1_D) + .addReg(RISCV::F2_D) + .addReg(RISCV::F2_D) + .getInstr(); + auto MI4Res = TII->isCopyInstrImpl(*MI4); + ASSERT_TRUE(MI4Res.has_value()); + EXPECT_EQ(MI4Res->Destination->getReg(), RISCV::F1_D); + EXPECT_EQ(MI4Res->Source->getReg(), RISCV::F2_D); + + // ADD. TODO: Should return true for add reg, x0 and add x0, reg. 
+ MachineInstr *MI5 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1) + .addReg(RISCV::X2) + .addReg(RISCV::X3) + .getInstr(); + auto MI5Res = TII->isCopyInstrImpl(*MI5); + EXPECT_FALSE(MI5Res.has_value()); + + MachineInstr *MI6 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1) + .addReg(RISCV::X0) + .addReg(RISCV::X2) + .getInstr(); + auto MI6Res = TII->isCopyInstrImpl(*MI6); + EXPECT_FALSE(MI6Res.has_value()); + + MachineInstr *MI7 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1) + .addReg(RISCV::X2) + .addReg(RISCV::X0) + .getInstr(); + auto MI7Res = TII->isCopyInstrImpl(*MI7); + EXPECT_FALSE(MI7Res.has_value()); +} + TEST_P(RISCVInstrInfoTest, GetMemOperandsWithOffsetWidth) { const RISCVInstrInfo *TII = ST->getInstrInfo(); const TargetRegisterInfo *TRI = ST->getRegisterInfo(); -- cgit v1.1 From d7fb94b6daa643a764e9a756bc544f26c248dafd Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 8 Feb 2024 11:09:45 +0000 Subject: [lldb][TypeSynthetic][NFC] Make SyntheticChildrenFrontend::Update() return an enum (#80167) This patch changes the return value of `SyntheticChildrenFrontend::Update` to a scoped enum that aims to describe what the return value means. --- lldb/include/lldb/DataFormatters/TypeSynthetic.h | 27 ++++--- lldb/include/lldb/DataFormatters/VectorIterator.h | 2 +- lldb/include/lldb/lldb-enumerations.h | 9 +++ lldb/source/Core/ValueObjectSyntheticFilter.cpp | 6 +- lldb/source/DataFormatters/TypeSynthetic.cpp | 8 ++- lldb/source/DataFormatters/VectorType.cpp | 4 +- .../Plugins/Language/CPlusPlus/BlockPointer.cpp | 4 +- .../Plugins/Language/CPlusPlus/Coroutines.cpp | 16 ++--- .../source/Plugins/Language/CPlusPlus/Coroutines.h | 2 +- .../Plugins/Language/CPlusPlus/GenericBitset.cpp | 8 +-- .../Plugins/Language/CPlusPlus/GenericOptional.cpp | 8 +-- lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp | 63 ++++++++-------- lldb/source/Plugins/Language/CPlusPlus/LibCxx.h | 8 +-- .../Plugins/Language/CPlusPlus/LibCxxAtomic.cpp | 7 +- .../Language/CPlusPlus/LibCxxInitializerList.cpp | 10 +-- .../Plugins/Language/CPlusPlus/LibCxxList.cpp | 32 ++++----- .../Plugins/Language/CPlusPlus/LibCxxMap.cpp | 9 +-- .../Plugins/Language/CPlusPlus/LibCxxQueue.cpp | 8 +-- .../Language/CPlusPlus/LibCxxRangesRefView.cpp | 11 +-- .../Plugins/Language/CPlusPlus/LibCxxSpan.cpp | 9 +-- .../Plugins/Language/CPlusPlus/LibCxxTuple.cpp | 8 +-- .../Language/CPlusPlus/LibCxxUnorderedMap.cpp | 20 +++--- .../Plugins/Language/CPlusPlus/LibCxxVariant.cpp | 12 ++-- .../Plugins/Language/CPlusPlus/LibCxxVector.cpp | 28 ++++---- .../Plugins/Language/CPlusPlus/LibStdcpp.cpp | 44 ++++++------ .../Plugins/Language/CPlusPlus/LibStdcppTuple.cpp | 8 +-- .../Language/CPlusPlus/LibStdcppUniquePointer.cpp | 8 +-- lldb/source/Plugins/Language/ObjC/Cocoa.cpp | 4 +- lldb/source/Plugins/Language/ObjC/NSArray.cpp | 45 ++++++------ lldb/source/Plugins/Language/ObjC/NSDictionary.cpp | 83 ++++++++++++---------- lldb/source/Plugins/Language/ObjC/NSError.cpp | 12 ++-- lldb/source/Plugins/Language/ObjC/NSException.cpp | 9 ++- lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp | 14 ++-- lldb/source/Plugins/Language/ObjC/NSSet.cpp | 46 ++++++------ 34 files changed, 321 insertions(+), 271 deletions(-) diff --git a/lldb/include/lldb/DataFormatters/TypeSynthetic.h b/lldb/include/lldb/DataFormatters/TypeSynthetic.h index 41be9b7e..23cc054b 100644 --- a/lldb/include/lldb/DataFormatters/TypeSynthetic.h +++ b/lldb/include/lldb/DataFormatters/TypeSynthetic.h @@ -49,14 +49,15 @@ public: virtual size_t GetIndexOfChildWithName(ConstString name) = 
0; - // this function is assumed to always succeed and it if fails, the front-end - // should know to deal with it in the correct way (most probably, by refusing - // to return any children) the return value of Update() should actually be - // interpreted as "ValueObjectSyntheticFilter cache is good/bad" if =true, - // ValueObjectSyntheticFilter is allowed to use the children it fetched - // previously and cached if =false, ValueObjectSyntheticFilter must throw - // away its cache, and query again for children - virtual bool Update() = 0; + /// This function is assumed to always succeed and if it fails, the front-end + /// should know to deal with it in the correct way (most probably, by refusing + /// to return any children). The return value of \ref Update should actually + /// be interpreted as "ValueObjectSyntheticFilter cache is good/bad". If this + /// function returns \ref lldb::ChildCacheState::eReuse, \ref + /// ValueObjectSyntheticFilter is allowed to use the children it fetched + /// previously and cached. Otherwise, \ref ValueObjectSyntheticFilter must + /// throw away its cache, and query again for children. + virtual lldb::ChildCacheState Update() = 0; // if this function returns false, then CalculateNumChildren() MUST return 0 // since UI frontends might validly decide not to inquire for children given @@ -116,7 +117,9 @@ public: return UINT32_MAX; } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } bool MightHaveChildren() override { return false; } @@ -328,7 +331,9 @@ public: filter->GetExpressionPathAtIndex(idx), true); } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } bool MightHaveChildren() override { return filter->GetCount() > 0; } @@ -427,7 +432,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/include/lldb/DataFormatters/VectorIterator.h b/lldb/include/lldb/DataFormatters/VectorIterator.h index 3414298..5f774bb 100644 --- a/lldb/include/lldb/DataFormatters/VectorIterator.h +++ b/lldb/include/lldb/DataFormatters/VectorIterator.h @@ -28,7 +28,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 392d333..7e9b538 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1305,6 +1305,15 @@ enum CompletionType { eTerminatorCompletion = (1ul << 27) }; +/// Specifies if children need to be re-computed +/// after a call to \ref SyntheticChildrenFrontEnd::Update. +enum class ChildCacheState { + eRefetch = 0, ///< Children need to be recomputed dynamically. + + eReuse = 1, ///< Children did not change and don't need to be recomputed; + ///< re-use what we computed the last time we called Update. 
+}; + } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index 43bc532..e8b4b02 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -43,7 +43,9 @@ public: bool MightHaveChildren() override { return m_backend.MightHaveChildren(); } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } }; ValueObjectSynthetic::ValueObjectSynthetic(ValueObject &parent, @@ -177,7 +179,7 @@ bool ValueObjectSynthetic::UpdateValue() { } // let our backend do its update - if (!m_synth_filter_up->Update()) { + if (m_synth_filter_up->Update() == lldb::ChildCacheState::eRefetch) { LLDB_LOGF(log, "[ValueObjectSynthetic::UpdateValue] name=%s, synthetic " "filter said caches are stale - clearing", diff --git a/lldb/source/DataFormatters/TypeSynthetic.cpp b/lldb/source/DataFormatters/TypeSynthetic.cpp index de042e4..8a6f132 100644 --- a/lldb/source/DataFormatters/TypeSynthetic.cpp +++ b/lldb/source/DataFormatters/TypeSynthetic.cpp @@ -190,11 +190,13 @@ size_t ScriptedSyntheticChildren::FrontEnd::CalculateNumChildren(uint32_t max) { return m_interpreter->CalculateNumChildren(m_wrapper_sp, max); } -bool ScriptedSyntheticChildren::FrontEnd::Update() { +lldb::ChildCacheState ScriptedSyntheticChildren::FrontEnd::Update() { if (!m_wrapper_sp || m_interpreter == nullptr) - return false; + return lldb::ChildCacheState::eRefetch; - return m_interpreter->UpdateSynthProviderInstance(m_wrapper_sp); + return m_interpreter->UpdateSynthProviderInstance(m_wrapper_sp) + ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool ScriptedSyntheticChildren::FrontEnd::MightHaveChildren() { diff --git a/lldb/source/DataFormatters/VectorType.cpp b/lldb/source/DataFormatters/VectorType.cpp index 57dae0b..c94ca68 100644 --- a/lldb/source/DataFormatters/VectorType.cpp +++ b/lldb/source/DataFormatters/VectorType.cpp @@ -245,7 +245,7 @@ public: return child_sp; } - bool Update() override { + lldb::ChildCacheState Update() override { m_parent_format = m_backend.GetFormat(); CompilerType parent_type(m_backend.GetCompilerType()); CompilerType element_type; @@ -258,7 +258,7 @@ public: ::CalculateNumChildren(element_type, num_elements, m_child_type) .value_or(0); m_item_format = GetItemFormatForFormat(m_parent_format, m_child_type); - return false; + return lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp index 314a4ac..2e43aa3 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp @@ -136,7 +136,9 @@ public: // return true if this object is now safe to use forever without ever // updating again; the typical (and tested) answer here is 'false' - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } // maybe return false if the block pointer is, say, null bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp index 6aeae97..7420174 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp +++ 
b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp @@ -125,24 +125,24 @@ lldb::ValueObjectSP lldb_private::formatters:: return lldb::ValueObjectSP(); } -bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() { m_resume_ptr_sp.reset(); m_destroy_ptr_sp.reset(); m_promise_ptr_sp.reset(); ValueObjectSP valobj_sp = m_backend.GetNonSyntheticValue(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; lldb::addr_t frame_ptr_addr = GetCoroFramePtrFromHandle(valobj_sp); if (frame_ptr_addr == 0 || frame_ptr_addr == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; auto ts = valobj_sp->GetCompilerType().GetTypeSystem(); auto ast_ctx = ts.dyn_cast_or_null(); if (!ast_ctx) - return false; + return lldb::ChildCacheState::eRefetch; // Create the `resume` and `destroy` children. lldb::TargetSP target_sp = m_backend.GetTargetSP(); @@ -165,7 +165,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: CompilerType promise_type( valobj_sp->GetCompilerType().GetTypeTemplateArgument(0)); if (!promise_type) - return false; + return lldb::ChildCacheState::eRefetch; // Try to infer the promise_type if it was type-erased if (promise_type.IsVoidType()) { @@ -180,7 +180,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: // If we don't know the promise type, we don't display the `promise` member. // `CreateValueObjectFromAddress` below would fail for `void` types. if (promise_type.IsVoidType()) { - return false; + return lldb::ChildCacheState::eRefetch; } // Add the `promise` member. We intentionally add `promise` as a pointer type @@ -194,7 +194,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: if (error.Success()) m_promise_ptr_sp = promisePtr->Clone(ConstString("promise")); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h index b26cc9e..d38c7ec 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h @@ -38,7 +38,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp index 2876efc..ac31663 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp @@ -33,7 +33,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_elements.size(); } ValueObjectSP GetChildAtIndex(size_t idx) override; @@ -78,13 +78,13 @@ llvm::StringRef GenericBitsetFrontEnd::GetDataContainerMemberName() { llvm_unreachable("Unknown StdLib enum"); } -bool GenericBitsetFrontEnd::Update() { +lldb::ChildCacheState GenericBitsetFrontEnd::Update() { m_elements.clear(); m_first = nullptr; TargetSP target_sp = m_backend.GetTargetSP(); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; size_t size = 0; @@ -94,7 +94,7 @@ bool GenericBitsetFrontEnd::Update() { 
m_elements.assign(size, ValueObjectSP()); m_first = m_backend.GetChildMemberWithName(GetDataContainerMemberName()).get(); - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP GenericBitsetFrontEnd::GetChildAtIndex(size_t idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp index 7415e91..57331ea 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp @@ -44,7 +44,7 @@ public: size_t CalculateNumChildren() override { return m_has_value ? 1U : 0U; } ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; private: bool m_has_value = false; @@ -61,7 +61,7 @@ GenericOptionalFrontend::GenericOptionalFrontend(ValueObject &valobj, } } -bool GenericOptionalFrontend::Update() { +lldb::ChildCacheState GenericOptionalFrontend::Update() { ValueObjectSP engaged_sp; if (m_stdlib == StdLib::LibCxx) @@ -71,14 +71,14 @@ ->GetChildMemberWithName("_M_engaged"); if (!engaged_sp) - return false; + return lldb::ChildCacheState::eRefetch; // _M_engaged/__engaged is a bool flag and is true if the optional contains a // value. Converting it to unsigned gives us a size of 1 if it contains a // value and 0 if not. m_has_value = engaged_sp->GetValueAsUnsigned(0) != 0; - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP GenericOptionalFrontend::GetChildAtIndex(size_t _idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index d0bdbe1..a7d7066 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -231,21 +231,22 @@ lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: Update(); } -bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { m_pair_sp.reset(); m_pair_ptr = nullptr; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; // this must be a ValueObject* because it is a child of the ValueObject we // are producing children for it if were a ValueObjectSP, we would end up @@ -278,7 +279,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { auto __i_(valobj_sp->GetChildMemberWithName("__i_")); if (!__i_) { m_pair_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } CompilerType pair_type( __i_->GetCompilerType().GetTypeTemplateArgument(0)); @@ -290,7 +291,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { 0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr); if (!pair_type) { m_pair_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } auto addr(m_pair_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS)); @@ -299,7 +300,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { auto ts = pair_type.GetTypeSystem(); auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>(); if (!ast_ctx) - return false; + return lldb::ChildCacheState::eRefetch; // Mimick layout of
std::__tree_iterator::__ptr_ and read it in // from process memory. @@ -328,14 +329,14 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { {"payload", pair_type}}); std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr); if (!size) - return false; + return lldb::ChildCacheState::eRefetch; WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0)); ProcessSP process_sp(target_sp->GetProcessSP()); Status error; process_sp->ReadMemory(addr, buffer_sp->GetBytes(), buffer_sp->GetByteSize(), error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(), process_sp->GetAddressByteSize()); auto pair_sp = CreateValueObjectFromData( @@ -347,7 +348,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() { } } - return false; + return lldb::ChildCacheState::eRefetch; } size_t lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: @@ -399,22 +400,22 @@ lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: Update(); } -bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState lldb_private::formatters:: + LibCxxUnorderedMapIteratorSyntheticFrontEnd::Update() { m_pair_sp.reset(); m_iter_ptr = nullptr; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; auto exprPathOptions = ValueObject::GetValueForExpressionPathOptions() .DontCheckDotVsArrowSyntax() @@ -437,7 +438,7 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: auto iter_child(valobj_sp->GetChildMemberWithName("__i_")); if (!iter_child) { m_iter_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } CompilerType node_type(iter_child->GetCompilerType() .GetTypeTemplateArgument(0) @@ -455,19 +456,19 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: 0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr); if (!pair_type) { m_iter_ptr = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } uint64_t addr = m_iter_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS); m_iter_ptr = nullptr; if (addr == 0 || addr == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; auto ts = pair_type.GetTypeSystem(); auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>(); if (!ast_ctx) - return false; + return lldb::ChildCacheState::eRefetch; // Mimick layout of std::__hash_iterator::__node_ and read it in // from process memory.
@@ -489,14 +490,14 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: {"__value_", pair_type}}); std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr); if (!size) - return false; + return lldb::ChildCacheState::eRefetch; WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0)); ProcessSP process_sp(target_sp->GetProcessSP()); Status error; process_sp->ReadMemory(addr, buffer_sp->GetBytes(), buffer_sp->GetByteSize(), error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(), process_sp->GetAddressByteSize()); auto pair_sp = CreateValueObjectFromData( @@ -505,7 +506,7 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: m_pair_sp = pair_sp->GetChildAtIndex(2); } - return false; + return lldb::ChildCacheState::eRefetch; } size_t lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: @@ -600,22 +601,23 @@ lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::GetChildAtIndex( return lldb::ValueObjectSP(); } -bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() { m_cntrl = nullptr; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; lldb::ValueObjectSP cntrl_sp(valobj_sp->GetChildMemberWithName("__cntrl_")); m_cntrl = cntrl_sp.get(); // need to store the raw pointer to avoid a circular // dependency - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd:: @@ -689,14 +691,15 @@ lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::GetChildAtIndex( return lldb::ValueObjectSP(); } -bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("__ptr_")); if (!ptr_sp) - return false; + return lldb::ChildCacheState::eRefetch; // Retrieve the actual pointer and the deleter, and clone them to give them // user-friendly names.
@@ -708,7 +711,7 @@ bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { if (deleter_sp) m_deleter_sp = deleter_sp->Clone(ConstString("deleter")); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index 72da6b2..cc8e13d 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -91,7 +91,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -139,7 +139,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -170,7 +170,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -190,7 +190,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp index eacc608..c81b1e80 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp @@ -94,7 +94,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -110,12 +110,13 @@ lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd:: LibcxxStdAtomicSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp) : SyntheticChildrenFrontEnd(*valobj_sp) {} -bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() { ValueObjectSP atomic_value = GetLibCxxAtomicValue(m_backend); if (atomic_value) m_real_child = GetLibCxxAtomicValue(m_backend).get(); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp index bfd7b88..3c33f94 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp @@ -30,7 +30,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -82,13 +82,13 @@ lldb::ValueObjectSP lldb_private::formatters:: m_element_type); } -bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::Update() { m_start = nullptr; m_num_elements = 0; m_element_type = m_backend.GetCompilerType().GetTypeTemplateArgument(0); if (!m_element_type.IsValid()) - return false; + return lldb::ChildCacheState::eRefetch; if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr)) { m_element_size = *size; @@ -96,7 +96,7 @@ bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::
m_start = m_backend.GetChildMemberWithName("__begin_").get(); } - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp index 2e2e2a8..e28ef81 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp @@ -109,7 +109,7 @@ public: return ExtractIndexFromString(name.GetCString()); } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; protected: AbstractListFrontEnd(ValueObject &valobj) @@ -138,7 +138,7 @@ public: size_t CalculateNumChildren() override; ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; }; class ListFrontEnd : public AbstractListFrontEnd { @@ -151,7 +151,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; private: lldb::addr_t m_node_address = 0; @@ -160,7 +160,7 @@ private: } // end anonymous namespace -bool AbstractListFrontEnd::Update() { +lldb::ChildCacheState AbstractListFrontEnd::Update() { m_loop_detected = 0; m_count = UINT32_MAX; m_head = nullptr; @@ -180,10 +180,10 @@ bool AbstractListFrontEnd::Update() { list_type = list_type.GetNonReferenceType(); if (list_type.GetNumTemplateArguments() == 0) - return false; + return lldb::ChildCacheState::eRefetch; m_element_type = list_type.GetTypeTemplateArgument(0); - return false; + return lldb::ChildCacheState::eRefetch; } bool AbstractListFrontEnd::HasLoop(size_t count) { @@ -284,22 +284,22 @@ ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(size_t idx) { m_element_type); } -bool ForwardListFrontEnd::Update() { +lldb::ChildCacheState ForwardListFrontEnd::Update() { AbstractListFrontEnd::Update(); Status err; ValueObjectSP backend_addr(m_backend.AddressOf(err)); if (err.Fail() || !backend_addr) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__before_begin_")); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; impl_sp = GetFirstValueOfLibCXXCompressedPair(*impl_sp); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_head = impl_sp->GetChildMemberWithName("__next_").get(); - return false; + return lldb::ChildCacheState::eRefetch; } ListFrontEnd::ListFrontEnd(lldb::ValueObjectSP valobj_sp) @@ -394,7 +394,7 @@ lldb::ValueObjectSP ListFrontEnd::GetChildAtIndex(size_t idx) { m_element_type); } -bool ListFrontEnd::Update() { +lldb::ChildCacheState ListFrontEnd::Update() { AbstractListFrontEnd::Update(); m_tail = nullptr; m_node_address = 0; @@ -402,16 +402,16 @@ bool ListFrontEnd::Update() { Status err; ValueObjectSP backend_addr(m_backend.AddressOf(err)); if (err.Fail() || !backend_addr) - return false; + return lldb::ChildCacheState::eRefetch; m_node_address = backend_addr->GetValueAsUnsigned(0); if (!m_node_address || m_node_address == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__end_")); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_head = impl_sp->GetChildMemberWithName("__next_").get(); m_tail = impl_sp->GetChildMemberWithName("__prev_").get(); - return false; + return lldb::ChildCacheState::eRefetch; } 
SyntheticChildrenFrontEnd *formatters::LibcxxStdListSyntheticFrontEndCreator( diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp index d3ee63a..d208acf 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp @@ -181,7 +181,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -405,15 +405,16 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::GetChildAtIndex( return potential_child_sp; } -bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() { m_count = UINT32_MAX; m_tree = m_root_node = nullptr; m_iterators.clear(); m_tree = m_backend.GetChildMemberWithName("__tree_").get(); if (!m_tree) - return false; + return lldb::ChildCacheState::eRefetch; m_root_node = m_tree->GetChildMemberWithName("__begin_node_").get(); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp index c31940a..83f93b1 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp @@ -26,7 +26,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_container_sp ? m_container_sp->GetNumChildren() : 0; @@ -47,13 +47,13 @@ private: }; } // namespace -bool QueueFrontEnd::Update() { +lldb::ChildCacheState QueueFrontEnd::Update() { m_container_sp = nullptr; ValueObjectSP c_sp = m_backend.GetChildMemberWithName("c"); if (!c_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_container_sp = c_sp->GetSyntheticValue().get(); - return false; + return lldb::ChildCacheState::eRefetch; } SyntheticChildrenFrontEnd * diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp index 6aeb557..c032d67 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp @@ -38,7 +38,7 @@ public: return m_range_sp; } - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override { return true; } @@ -59,17 +59,18 @@ lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd:: Update(); } -bool lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd::Update() { ValueObjectSP range_ptr = GetChildMemberWithName(m_backend, {ConstString("__range_")}); if (!range_ptr) - return false; + return lldb::ChildCacheState::eRefetch; lldb_private::Status error; m_range_sp = range_ptr->Dereference(error); - return error.Success(); + return error.Success() ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } lldb_private::SyntheticChildrenFrontEnd * diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp index ec062ed..4ddfaef 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp @@ -53,7 +53,7 @@ public: // This function checks for a '__size' member to determine the number // of elements in the span. If no such member exists, we get the size // from the only other place it can be: the template argument. - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -93,12 +93,13 @@ lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::GetChildAtIndex( m_element_type); } -bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { // Get element type. ValueObjectSP data_type_finder_sp = GetChildMemberWithName( m_backend, {ConstString("__data_"), ConstString("__data")}); if (!data_type_finder_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_element_type = data_type_finder_sp->GetCompilerType().GetPointeeType(); @@ -122,7 +123,7 @@ bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { } } - return true; + return lldb::ChildCacheState::eReuse; } bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp index 9024ed4..54687101 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp @@ -25,7 +25,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_elements.size(); } ValueObjectSP GetChildAtIndex(size_t idx) override; @@ -40,7 +40,7 @@ private: }; } -bool TupleFrontEnd::Update() { +lldb::ChildCacheState TupleFrontEnd::Update() { m_elements.clear(); m_base = nullptr; @@ -51,11 +51,11 @@ bool TupleFrontEnd::Update() { base_sp = m_backend.GetChildMemberWithName("base_"); } if (!base_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_base = base_sp.get(); m_elements.assign(base_sp->GetCompilerType().GetNumDirectBaseClasses(), nullptr); - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP TupleFrontEnd::GetChildAtIndex(size_t idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp index 1a85d37..4cac52f 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp @@ -37,7 +37,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -193,41 +193,41 @@ lldb::ValueObjectSP lldb_private::formatters:: m_element_type); } -bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::Update() { m_num_elements = 0; m_next_element = nullptr; m_elements_cache.clear(); ValueObjectSP table_sp = m_backend.GetChildMemberWithName("__table_"); if (!table_sp) - return 
false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP p2_sp = table_sp->GetChildMemberWithName("__p2_"); if (!p2_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP num_elements_sp = GetFirstValueOfLibCXXCompressedPair(*p2_sp); if (!num_elements_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP p1_sp = table_sp->GetChildMemberWithName("__p1_"); if (!p1_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP value_sp = GetFirstValueOfLibCXXCompressedPair(*p1_sp); if (!value_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_tree = value_sp->GetChildMemberWithName("__next_").get(); if (m_tree == nullptr) - return false; + return lldb::ChildCacheState::eRefetch; m_num_elements = num_elements_sp->GetValueAsUnsigned(0); if (m_num_elements > 0) m_next_element = m_tree; - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp index e863ccc..ecbb7cf 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp @@ -204,7 +204,7 @@ public: } bool MightHaveChildren() override { return true; } - bool Update() override; + lldb::ChildCacheState Update() override; size_t CalculateNumChildren() override { return m_size; } ValueObjectSP GetChildAtIndex(size_t idx) override; @@ -213,24 +213,24 @@ private: }; } // namespace -bool VariantFrontEnd::Update() { +lldb::ChildCacheState VariantFrontEnd::Update() { m_size = 0; ValueObjectSP impl_sp = formatters::GetChildMemberWithName( m_backend, {ConstString("__impl_"), ConstString("__impl")}); if (!impl_sp) - return false; + return lldb::ChildCacheState::eRefetch; LibcxxVariantIndexValidity validity = LibcxxVariantGetIndexValidity(impl_sp); if (validity == LibcxxVariantIndexValidity::Invalid) - return false; + return lldb::ChildCacheState::eRefetch; if (validity == LibcxxVariantIndexValidity::NPos) - return true; + return lldb::ChildCacheState::eReuse; m_size = 1; - return false; + return lldb::ChildCacheState::eRefetch; } ValueObjectSP VariantFrontEnd::GetChildAtIndex(size_t idx) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp index 9d88fcf..0c3c3f0 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp @@ -29,7 +29,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -50,7 +50,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override { return true; } @@ -116,17 +116,18 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::GetChildAtIndex( m_element_type); } -bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { m_start = m_finish = nullptr; ValueObjectSP data_type_finder_sp( m_backend.GetChildMemberWithName("__end_cap_")); if (!data_type_finder_sp) - return false; + return lldb::ChildCacheState::eRefetch; data_type_finder_sp = GetFirstValueOfLibCXXCompressedPair(*data_type_finder_sp); if 
(!data_type_finder_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_element_type = data_type_finder_sp->GetCompilerType().GetPointeeType(); if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr)) { @@ -138,7 +139,7 @@ bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { m_finish = m_backend.GetChildMemberWithName("__end_").get(); } } - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd:: @@ -226,29 +227,30 @@ lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::GetChildAtIndex( } }*/ -bool lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); ValueObjectSP size_sp(valobj_sp->GetChildMemberWithName("__size_")); if (!size_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_count = size_sp->GetValueAsUnsigned(0); if (!m_count) - return true; + return lldb::ChildCacheState::eReuse; ValueObjectSP begin_sp(valobj_sp->GetChildMemberWithName("__begin_")); if (!begin_sp) { m_count = 0; - return false; + return lldb::ChildCacheState::eRefetch; } m_base_data_address = begin_sp->GetValueAsUnsigned(0); if (!m_base_data_address) { m_count = 0; - return false; + return lldb::ChildCacheState::eRefetch; } - return false; + return lldb::ChildCacheState::eRefetch; } size_t lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd:: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index 23af50f..4115518 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -47,7 +47,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -68,7 +68,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -94,29 +94,29 @@ LibstdcppMapIteratorSyntheticFrontEnd::LibstdcppMapIteratorSyntheticFrontEnd( Update(); } -bool LibstdcppMapIteratorSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibstdcppMapIteratorSyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; TargetSP target_sp(valobj_sp->GetTargetSP()); if (!target_sp) - return false; + return lldb::ChildCacheState::eRefetch; bool is_64bit = (target_sp->GetArchitecture().GetAddressByteSize() == 8); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); ValueObjectSP _M_node_sp(valobj_sp->GetChildMemberWithName("_M_node")); if (!_M_node_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_pair_address = _M_node_sp->GetValueAsUnsigned(0); if (m_pair_address == 0) - return false; + return lldb::ChildCacheState::eRefetch; m_pair_address += (is_64bit ?
32 : 16); @@ -124,12 +124,12 @@ bool LibstdcppMapIteratorSyntheticFrontEnd::Update() { if (my_type.GetNumTemplateArguments() >= 1) { CompilerType pair_type = my_type.GetTypeTemplateArgument(0); if (!pair_type) - return false; + return lldb::ChildCacheState::eRefetch; m_pair_type = pair_type; } else - return false; + return lldb::ChildCacheState::eRefetch; - return true; + return lldb::ChildCacheState::eReuse; } size_t LibstdcppMapIteratorSyntheticFrontEnd::CalculateNumChildren() { @@ -193,22 +193,22 @@ lldb_private::formatters::VectorIteratorSyntheticFrontEnd:: Update(); } -bool VectorIteratorSyntheticFrontEnd::Update() { +lldb::ChildCacheState VectorIteratorSyntheticFrontEnd::Update() { m_item_sp.reset(); ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP item_ptr = formatters::GetChildMemberWithName(*valobj_sp, m_item_names); if (!item_ptr) - return false; + return lldb::ChildCacheState::eRefetch; if (item_ptr->GetValueAsUnsigned(0) == 0) - return false; + return lldb::ChildCacheState::eRefetch; Status err; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); m_item_sp = CreateValueObjectFromAddress( @@ -216,7 +216,7 @@ bool VectorIteratorSyntheticFrontEnd::Update() { item_ptr->GetCompilerType().GetPointeeType()); if (err.Fail()) m_item_sp.reset(); - return false; + return lldb::ChildCacheState::eRefetch; } size_t VectorIteratorSyntheticFrontEnd::CalculateNumChildren() { return 1; } @@ -390,23 +390,23 @@ LibStdcppSharedPtrSyntheticFrontEnd::GetChildAtIndex(size_t idx) { return lldb::ValueObjectSP(); } -bool LibStdcppSharedPtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibStdcppSharedPtrSyntheticFrontEnd::Update() { auto backend = m_backend.GetSP(); if (!backend) - return false; + return lldb::ChildCacheState::eRefetch; auto valobj_sp = backend->GetNonSyntheticValue(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; auto ptr_obj_sp = valobj_sp->GetChildMemberWithName("_M_ptr"); if (!ptr_obj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_obj = ptr_obj_sp->Clone(ConstString("pointer")).get(); m_obj_obj = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } bool LibStdcppSharedPtrSyntheticFrontEnd::MightHaveChildren() { return true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp index f1bfeae..189f956 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp @@ -30,7 +30,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -53,12 +53,12 @@ LibStdcppTupleSyntheticFrontEnd::LibStdcppTupleSyntheticFrontEnd( Update(); } -bool LibStdcppTupleSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibStdcppTupleSyntheticFrontEnd::Update() { m_members.clear(); ValueObjectSP valobj_backend_sp = m_backend.GetSP(); if (!valobj_backend_sp) - return false; + return lldb::ChildCacheState::eRefetch; ValueObjectSP next_child_sp = valobj_backend_sp->GetNonSyntheticValue(); while (next_child_sp != nullptr) { @@ -83,7 +83,7 @@ bool LibStdcppTupleSyntheticFrontEnd::Update() { } } - return false; + return lldb::ChildCacheState::eRefetch; } bool LibStdcppTupleSyntheticFrontEnd::MightHaveChildren() { return 
true; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp index a84d641..3b0f632 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp @@ -30,7 +30,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -84,11 +84,11 @@ ValueObjectSP LibStdcppUniquePtrSyntheticFrontEnd::GetTuple() { return obj_child_sp; } -bool LibStdcppUniquePtrSyntheticFrontEnd::Update() { +lldb::ChildCacheState LibStdcppUniquePtrSyntheticFrontEnd::Update() { ValueObjectSP tuple_sp = GetTuple(); if (!tuple_sp) - return false; + return lldb::ChildCacheState::eRefetch; std::unique_ptr<SyntheticChildrenFrontEnd> tuple_frontend( LibStdcppTupleSyntheticFrontEndCreator(nullptr, tuple_sp)); @@ -110,7 +110,7 @@ bool LibStdcppUniquePtrSyntheticFrontEnd::Update() { } m_obj_obj = nullptr; - return false; + return lldb::ChildCacheState::eRefetch; } bool LibStdcppUniquePtrSyntheticFrontEnd::MightHaveChildren() { return true; } diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index f1a7e04..64047dc 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -1044,7 +1044,9 @@ public: return lldb::ValueObjectSP(); } - bool Update() override { return false; } + lldb::ChildCacheState Update() override { + return lldb::ChildCacheState::eRefetch; + } bool MightHaveChildren() override { return false; } diff --git a/lldb/source/Plugins/Language/ObjC/NSArray.cpp b/lldb/source/Plugins/Language/ObjC/NSArray.cpp index 7d0004c..09bf7a2 100644 --- a/lldb/source/Plugins/Language/ObjC/NSArray.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSArray.cpp @@ -54,7 +54,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override = 0; + lldb::ChildCacheState Update() override = 0; bool MightHaveChildren() override; @@ -81,7 +81,7 @@ public: ~GenericNSArrayMSyntheticFrontEnd() override; - bool Update() override; + lldb::ChildCacheState Update() override; protected: lldb::addr_t GetDataAddress() override; @@ -218,7 +218,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -306,7 +306,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -323,7 +323,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -500,9 +500,8 @@ lldb_private::formatters::NSArrayMSyntheticFrontEndBase::GetChildAtIndex( } template <typename D32, typename D64> -bool -lldb_private::formatters:: - GenericNSArrayMSyntheticFrontEnd<D32, D64>::Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSArrayMSyntheticFrontEnd<D32, D64>::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; delete m_data_32; @@ -510,13 +509,13 @@ lldb_private::formatters:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return
false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; if (m_ptr_size == 4) { @@ -529,7 +528,8 @@ lldb_private::formatters:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool @@ -641,9 +641,9 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: } template -bool -lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSArrayISyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; delete m_data_32; @@ -651,13 +651,13 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; if (m_ptr_size == 4) { @@ -670,7 +670,8 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } template @@ -723,8 +724,9 @@ lldb_private::formatters::NSArray0SyntheticFrontEnd::CalculateNumChildren() { return 0; } -bool lldb_private::formatters::NSArray0SyntheticFrontEnd::Update() { - return false; +lldb::ChildCacheState +lldb_private::formatters::NSArray0SyntheticFrontEnd::Update() { + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSArray0SyntheticFrontEnd::MightHaveChildren() { @@ -757,8 +759,9 @@ lldb_private::formatters::NSArray1SyntheticFrontEnd::CalculateNumChildren() { return 1; } -bool lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() { - return false; +lldb::ChildCacheState +lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() { + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSArray1SyntheticFrontEnd::MightHaveChildren() { diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp index d377ee7..9c252a9 100644 --- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp @@ -107,7 +107,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -148,7 +148,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -180,7 +180,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -213,7 +213,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -234,7 +234,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -266,9 +266,9 @@ namespace 
Foundation1100 { size_t CalculateNumChildren() override; lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - - bool Update() override; - + + lldb::ChildCacheState Update() override; + bool MightHaveChildren() override; size_t GetIndexOfChildWithName(ConstString name) override; @@ -613,7 +613,8 @@ size_t lldb_private::formatters::NSDictionaryISyntheticFrontEnd:: return (m_data_32 ? m_data_32->_used : m_data_64->_used); } -bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { m_children.clear(); delete m_data_32; m_data_32 = nullptr; @@ -622,13 +623,13 @@ bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { m_ptr_size = 0; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; @@ -642,9 +643,9 @@ bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { error); } if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_data_ptr = data_location + m_ptr_size; - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd:: @@ -750,20 +751,23 @@ size_t lldb_private::formatters::NSCFDictionarySyntheticFrontEnd:: return m_hashtable.GetCount(); } -bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); - return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref); + return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref) + ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd:: @@ -881,30 +885,33 @@ size_t lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd:: return m_size; } -bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() { ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t valobj_addr = valobj_sp->GetValueAsUnsigned(0); m_size = process_sp->ReadUnsignedIntegerFromMemory( valobj_addr + 2 * m_ptr_size, m_ptr_size, 0, error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_keys_ptr = process_sp->ReadPointerFromMemory(valobj_addr + 3 * m_ptr_size, error); if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_objects_ptr = process_sp->ReadPointerFromMemory(valobj_addr + 4 * m_ptr_size, error); - return !error.Fail(); + + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd:: @@ -992,9 +999,10 @@ size_t lldb_private::formatters::NSDictionary1SyntheticFrontEnd:: return 1; } -bool lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() { m_pair.reset(); - return false; + return lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSDictionary1SyntheticFrontEnd:: @@ -1087,9 +1095,9 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd::Calcul } template -bool -lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: - Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; @@ -1098,13 +1106,13 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; @@ -1118,7 +1126,8 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } template @@ -1249,9 +1258,8 @@ lldb_private::formatters::Foundation1100:: return (m_data_32 ? 
m_data_32->_used : m_data_64->_used); } -bool -lldb_private::formatters::Foundation1100:: - NSDictionaryMSyntheticFrontEnd::Update() { +lldb::ChildCacheState lldb_private::formatters::Foundation1100:: + NSDictionaryMSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; @@ -1260,13 +1268,13 @@ lldb_private::formatters::Foundation1100:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); Status error; error.Clear(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; @@ -1280,7 +1288,8 @@ lldb_private::formatters::Foundation1100:: error); } - return error.Success(); + return error.Success() ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp index 99eeb2d..ce52ae5 100644 --- a/lldb/source/Plugins/Language/ObjC/NSError.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp @@ -133,17 +133,17 @@ public: return m_child_sp; } - bool Update() override { + lldb::ChildCacheState Update() override { m_child_ptr = nullptr; m_child_sp.reset(); ProcessSP process_sp(m_backend.GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; lldb::addr_t userinfo_location = DerefToNSErrorPointer(m_backend); if (userinfo_location == LLDB_INVALID_ADDRESS) - return false; + return lldb::ChildCacheState::eRefetch; size_t ptr_size = process_sp->GetAddressByteSize(); @@ -152,17 +152,17 @@ public: lldb::addr_t userinfo = process_sp->ReadPointerFromMemory(userinfo_location, error); if (userinfo == LLDB_INVALID_ADDRESS || error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; InferiorSizedWord isw(userinfo, *process_sp); TypeSystemClangSP scratch_ts_sp = ScratchTypeSystemClang::GetForTarget(process_sp->GetTarget()); if (!scratch_ts_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_child_sp = CreateValueObjectFromData( "_userInfo", isw.GetAsData(process_sp->GetByteOrder()), m_backend.GetExecutionContextRef(), scratch_ts_sp->GetBasicType(lldb::eBasicTypeObjCID)); - return false; + return lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSException.cpp b/lldb/source/Plugins/Language/ObjC/NSException.cpp index 29805bb..e8011e5 100644 --- a/lldb/source/Plugins/Language/ObjC/NSException.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSException.cpp @@ -137,14 +137,17 @@ public: return lldb::ValueObjectSP(); } - bool Update() override { + lldb::ChildCacheState Update() override { m_name_sp.reset(); m_reason_sp.reset(); m_userinfo_sp.reset(); m_reserved_sp.reset(); - return ExtractFields(m_backend, &m_name_sp, &m_reason_sp, &m_userinfo_sp, - &m_reserved_sp); + const auto ret = ExtractFields(m_backend, &m_name_sp, &m_reason_sp, + &m_userinfo_sp, &m_reserved_sp); + + return ret ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp b/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp index 2a4ce80..69e6ab1 100644 --- a/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp @@ -46,17 +46,17 @@ public: return m_impl.GetIndexAtIndex(idx, m_uint_star_type); } - bool Update() override { + lldb::ChildCacheState Update() override { m_impl.Clear(); auto type_system = m_backend.GetCompilerType().GetTypeSystem(); if (!type_system) - return false; + return lldb::ChildCacheState::eRefetch; auto ast = ScratchTypeSystemClang::GetForTarget( *m_backend.GetExecutionContextRef().GetTargetSP()); if (!ast) - return false; + return lldb::ChildCacheState::eRefetch; m_uint_star_type = ast->GetPointerSizedIntType(false); @@ -65,18 +65,18 @@ public: ProcessSP process_sp = m_backend.GetProcessSP(); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; ObjCLanguageRuntime *runtime = ObjCLanguageRuntime::Get(*process_sp); if (!runtime) - return false; + return lldb::ChildCacheState::eRefetch; ObjCLanguageRuntime::ClassDescriptorSP descriptor( runtime->GetClassDescriptor(m_backend)); if (!descriptor.get() || !descriptor->IsValid()) - return false; + return lldb::ChildCacheState::eRefetch; uint64_t info_bits(0), value_bits(0), payload(0); @@ -119,7 +119,7 @@ public: } } } - return false; + return lldb::ChildCacheState::eRefetch; } bool MightHaveChildren() override { return m_impl.m_mode != Mode::Invalid; } diff --git a/lldb/source/Plugins/Language/ObjC/NSSet.cpp b/lldb/source/Plugins/Language/ObjC/NSSet.cpp index ed1751c..ede6485 100644 --- a/lldb/source/Plugins/Language/ObjC/NSSet.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSSet.cpp @@ -50,7 +50,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -88,7 +88,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -121,7 +121,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -237,7 +237,7 @@ public: lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; - bool Update() override; + lldb::ChildCacheState Update() override; bool MightHaveChildren() override; @@ -426,7 +426,8 @@ lldb_private::formatters::NSSetISyntheticFrontEnd::CalculateNumChildren() { return (m_data_32 ? 
m_data_32->_used : m_data_64->_used); } -bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { m_children.clear(); delete m_data_32; m_data_32 = nullptr; @@ -435,13 +436,13 @@ bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { m_ptr_size = 0; ValueObjectSP valobj_sp = m_backend.GetSP(); if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; Status error; @@ -455,9 +456,9 @@ bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { error); } if (error.Fail()) - return false; + return lldb::ChildCacheState::eRefetch; m_data_ptr = data_location + m_ptr_size; - return true; + return lldb::ChildCacheState::eReuse; } bool lldb_private::formatters::NSSetISyntheticFrontEnd::MightHaveChildren() { @@ -561,20 +562,23 @@ lldb_private::formatters::NSCFSetSyntheticFrontEnd::CalculateNumChildren() { return m_hashtable.GetCount(); } -bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); m_order = process_sp->GetByteOrder(); - return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref); + return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref) + ? lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::MightHaveChildren() { @@ -701,9 +705,8 @@ lldb_private::formatters:: } template -bool -lldb_private::formatters:: - GenericNSSetMSyntheticFrontEnd::Update() { +lldb::ChildCacheState +lldb_private::formatters::GenericNSSetMSyntheticFrontEnd::Update() { m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetSP(); m_ptr_size = 0; @@ -712,13 +715,13 @@ lldb_private::formatters:: delete m_data_64; m_data_64 = nullptr; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; if (!valobj_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_exe_ctx_ref = valobj_sp->GetExecutionContextRef(); lldb::ProcessSP process_sp(valobj_sp->GetProcessSP()); if (!process_sp) - return false; + return lldb::ChildCacheState::eRefetch; m_ptr_size = process_sp->GetAddressByteSize(); uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size; Status error; @@ -731,7 +734,8 @@ lldb_private::formatters:: process_sp->ReadMemory(data_location, m_data_64, sizeof(D64), error); } - return error.Success(); + return error.Success() ? 
lldb::ChildCacheState::eReuse + : lldb::ChildCacheState::eRefetch; } template -- cgit v1.1 From b35c5197629494cb675948fe33d2fdcd75b5aafa Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 11:43:29 +0000 Subject: [DAG] tryToFoldExtendOfConstant - share the same SDLoc argument instead of recreating it over and over again. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 61 +++++++++++++-------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4adea02..d3cd9b1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12739,12 +12739,12 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). /// Vector extends are not folded if operations are legal; this is to /// avoid introducing illegal build_vector dag nodes. -static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, +static SDValue tryToFoldExtendOfConstant(SDNode *N, const SDLoc &DL, + const TargetLowering &TLI, SelectionDAG &DAG, bool LegalTypes) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - SDLoc DL(N); assert((ISD::isExtOpcode(Opcode) || ISD::isExtVecInRegOpcode(Opcode)) && "Expected EXTEND dag node in input!"); @@ -13400,7 +13400,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (N0.isUndef()) return DAG.getConstant(0, DL, VT); - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; // fold (sext (sext x)) -> (sext x) @@ -13669,7 +13669,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (N0.isUndef()) return DAG.getConstant(0, DL, VT); - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; // fold (zext (zext x)) -> (zext x) @@ -13937,12 +13937,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // aext(undef) = undef if (N0.isUndef()) return DAG.getUNDEF(VT); - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; // fold (aext (aext x)) -> (aext x) @@ -13951,7 +13952,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) - return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0)); // fold (aext (aext_extend_vector_inreg x)) -> (aext_extend_vector_inreg x) // fold (aext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x) @@ -13959,7 +13960,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) - return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0)); // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) @@ -13977,7 
+13978,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // fold (aext (truncate x)) if (N0.getOpcode() == ISD::TRUNCATE) - return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); + return DAG.getAnyExtOrTrunc(N0.getOperand(0), DL, VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. @@ -13985,7 +13986,6 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && !TLI.isTruncateFree(N0.getOperand(0).getOperand(0), N0.getValueType())) { - SDLoc DL(N); SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT); SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1)); assert(isa<ConstantSDNode>(Y) && "Expected constant to be folded!"); @@ -14011,9 +14011,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, - LN0->getChain(), LN0->getBasePtr(), - N0.getValueType(), LN0->getMemOperand()); + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, LN0->getChain(), + LN0->getBasePtr(), N0.getValueType(), + LN0->getMemOperand()); ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = N0.hasOneUse(); @@ -14039,9 +14039,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), - VT, LN0->getChain(), LN0->getBasePtr(), - MemVT, LN0->getMemOperand()); + SDValue ExtLoad = + DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), LN0->getBasePtr(), + MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); recursivelyDeleteUnusedNodes(LN0); @@ -14069,23 +14069,20 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. 
if (VT.getSizeInBits() == N00VT.getSizeInBits()) - return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getSetCC(DL, VT, N0.getOperand(0), N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/any extend EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); - SDValue VsetCC = - DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT); + SDValue VsetCC = DAG.getSetCC( + DL, MatchingVectorType, N0.getOperand(0), N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getAnyExtOrTrunc(VsetCC, DL, VT); } // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc - SDLoc DL(N); if (SDValue SCC = SimplifySelectCC( DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), @@ -14637,10 +14634,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return SDValue(); } -static SDValue -foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI, - SelectionDAG &DAG, - bool LegalOperations) { +static SDValue foldExtendVectorInregToExtendOfSubvector( + SDNode *N, const SDLoc &DL, const TargetLowering &TLI, SelectionDAG &DAG, + bool LegalOperations) { unsigned InregOpcode = N->getOpcode(); unsigned Opcode = DAG.getOpcode_EXTEND(InregOpcode); @@ -14667,28 +14663,29 @@ foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI, if (LegalOperations && !TLI.isOperationLegal(Opcode, VT)) return SDValue(); - return DAG.getNode(Opcode, SDLoc(N), VT, Src); + return DAG.getNode(Opcode, DL, VT, Src); } SDValue DAGCombiner::visitEXTEND_VECTOR_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); if (N0.isUndef()) { // aext_vector_inreg(undef) = undef because the top bits are undefined. // {s/z}ext_vector_inreg(undef) = 0 because the top bits must be the same. return N->getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ? DAG.getUNDEF(VT) - : DAG.getConstant(0, SDLoc(N), VT); + : DAG.getConstant(0, DL, VT); } - if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes)) return Res; if (SimplifyDemandedVectorElts(SDValue(N, 0))) return SDValue(N, 0); - if (SDValue R = foldExtendVectorInregToExtendOfSubvector(N, TLI, DAG, + if (SDValue R = foldExtendVectorInregToExtendOfSubvector(N, DL, TLI, DAG, LegalOperations)) return R; -- cgit v1.1 From a643ab852a63a14dba86e031247734c5e3d5adb9 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 11:49:04 +0000 Subject: [DebugInfo][RemoveDIs] Final omnibus test fixing for RemoveDIs (#81125) With this, I get a clean test suite running under RemoveDIs, the non-intrinsic representation of debug-info, including under asan. We've previously established that we generate identical binaries for some large projects, so this is just edge-case cleanup. The changes: * CodeGenPrepare fixups need to apply to dbg.assigns as well as dbg.values (a dbg.assign is a dbg.value). * Pin a test for constant-deletion to intrinsic debug-info: this very rare scenario uses a different kill-location sigil in dbg.value mode to RemoveDIs mode, which generates spurious test differences. 
* Suppress a memory leak in a unit test: the code for dealing with trailing debug-info in a block is necessarily fiddly, leading to this leak when testing it. Developer-facing interfaces for moving instructions around always deal with this behind the scenes. * SROA, when replacing some vector-loads, needs to insert the replacement loads ahead of any debug-info records so that their values remain dominated by a definition. Set the head-bit indicating our insertion should come before debug-info. --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 3 ++- llvm/lib/Transforms/Scalar/SROA.cpp | 7 ++++++- .../Generic/assignment-tracking/codegenprepare/sunk-addr.ll | 5 +++++ llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll | 7 ++++++- llvm/test/Transforms/SROA/vector-promotion.ll | 4 ++++ llvm/unittests/IR/BasicBlockDbgInfoTest.cpp | 1 + 6 files changed, 24 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 5383b15..09c4922 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8455,7 +8455,8 @@ bool CodeGenPrepare::fixupDPValuesOnInst(Instruction &I) { // FIXME: should updating debug-info really cause the "changed" flag to fire, // which can cause a function to be reprocessed? bool CodeGenPrepare::fixupDPValue(DPValue &DPV) { - if (DPV.Type != DPValue::LocationType::Value) + if (DPV.Type != DPValue::LocationType::Value && + DPV.Type != DPValue::LocationType::Assign) return false; // Does this DPValue refer to a sunk address calculation? diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index bdbaf4f..e92e245 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2956,7 +2956,12 @@ private: assert(DL.typeSizeEqualsStoreSize(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); + BasicBlock::iterator LIIt = std::next(LI.getIterator()); + // Ensure the insertion point comes before any debug-info immediately + // after the load, so that variable values referring to the load are + // dominated by it. + LIIt.setHeadBit(true); + IRB.SetInsertPoint(LI.getParent(), LIIt); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll index 70548465..8b226aa 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll @@ -3,6 +3,11 @@ ; RUN: -mtriple=x86_64-unknown-unknown %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg." +;; Test with RemoveDIs non-intrinsic debug-info too. +; RUN: llc -start-before=codegenprepare -stop-after=codegenprepare \ +; RUN: -mtriple=x86_64-unknown-unknown %s -o - --try-experimental-debuginfo-iterators \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg." + ;; Check that when CodeGenPrepare moves an address computation to a block it's ;; used in its dbg.assign uses are updated. 
;; diff --git a/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll b/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll index 18dc038..5d6cc7d 100644 --- a/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll +++ b/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll @@ -1,4 +1,9 @@ -; RUN: opt -S < %s -passes=globalopt | FileCheck %s +; RUN: opt -S < %s -passes=globalopt --experimental-debuginfo-iterators=false | FileCheck %s +;; FIXME: this test is pinned to not use RemoveDIs non-intrinsic debug-info. +;; Constant-deletion takes a slightly different path and (correctly) replaces +;; the operand of the debug-info record with poison instead of a null pointer. +;; This is a spurious test difference that we'll suppress for turning RemoveDIs +;; on. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index e2aa1e2..e48dd5b 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -2,6 +2,10 @@ ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG ; RUN: opt < %s -passes=debugify,sroa -S | FileCheck %s --check-prefix=DEBUG +;; Ensure that these work with non-intrinsic variable locations. +; RUN: opt < %s -passes='sroa' -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG +; RUN: opt < %s -passes='sroa' -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG +; RUN: opt < %s -passes=debugify,sroa -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=DEBUG target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" %S1 = type { i64, [42 x float] } diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp index 827b4a9..ef2b288 100644 --- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp +++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp @@ -1476,6 +1476,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) { // ... except for some dangling DPValues. EXPECT_NE(Exit.getTrailingDPValues(), nullptr); EXPECT_FALSE(Exit.getTrailingDPValues()->empty()); + Exit.getTrailingDPValues()->eraseFromParent(); Exit.deleteTrailingDPValues(); UseNewDbgInfoFormat = false; -- cgit v1.1 From 7d4733a267cafa2109dc43b151dbae5716f372e4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 11:49:17 +0000 Subject: [X86] LowerBUILD_VECTOR - share the same SDLoc argument instead of recreating it over and over again. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 87 ++++++++++++++++----------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b5b76c6..f310010 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7135,6 +7135,7 @@ static bool isFoldableUseOfShuffle(SDNode *N) { /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, + const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. 
@@ -7145,8 +7146,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, MVT VT = BVOp->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); - SDLoc dl(BVOp); - assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); @@ -7492,14 +7491,13 @@ static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. -static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, +static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); - - SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorAllOnes(Op.getNode())) return Op; @@ -7618,7 +7616,7 @@ LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) { /// See the corrected implementation in isHopBuildVector(). Can we reduce this /// code because it is only used for partial h-op matching now? static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, - SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); @@ -7928,6 +7926,7 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or /// X86ISD::FMSUBADD node. static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, + const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; @@ -7938,7 +7937,6 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, return SDValue(); MVT VT = BV->getSimpleValueType(0); - SDLoc DL(BV); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; @@ -8057,22 +8055,22 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, } static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, - SelectionDAG &DAG, unsigned HOpcode, - SDValue V0, SDValue V1) { + const SDLoc &DL, SelectionDAG &DAG, + unsigned HOpcode, SDValue V0, SDValue V1) { // If either input vector is not the same size as the build vector, // extract/insert the low bits to the correct size. // This is free (examples: zmm --> xmm, xmm --> ymm). 
MVT VT = BV->getSimpleValueType(0); unsigned Width = VT.getSizeInBits(); if (V0.getValueSizeInBits() > Width) - V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width); + V0 = extractSubVector(V0, 0, DAG, DL, Width); else if (V0.getValueSizeInBits() < Width) - V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width); + V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width); if (V1.getValueSizeInBits() > Width) - V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width); + V1 = extractSubVector(V1, 0, DAG, DL, Width); else if (V1.getValueSizeInBits() < Width) - V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); + V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width); unsigned NumElts = VT.getVectorNumElements(); APInt DemandedElts = APInt::getAllOnes(NumElts); @@ -8084,17 +8082,17 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { MVT HalfVT = VT.getHalfNumVectorElementsVT(); - V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); - V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); - SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); - return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); + V0 = extractSubVector(V0, 0, DAG, DL, 128); + V1 = extractSubVector(V1, 0, DAG, DL, 128); + SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1); + return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256); } - return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); + return DAG.getNode(HOpcode, DL, VT, V0, V1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. -static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, +static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. @@ -8114,7 +8112,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, unsigned HOpcode; SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); + return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1); } // Try harder to match 256-bit ops by using extract/concat. 
@@ -8134,22 +8132,21 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, if (BV->getOperand(i)->isUndef()) NumUndefsHI++; - SDLoc DL(BV); SDValue InVec0, InVec1; if (VT == MVT::v8i32 || VT == MVT::v16i16) { SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; - if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && - isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2, + if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0, InVec1) && - isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2, + isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) @@ -8179,15 +8176,16 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) { unsigned X86Opcode; - if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0, + InVec1)) X86Opcode = X86ISD::HADD; - else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; - else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; - else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, + else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else @@ -8218,10 +8216,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, /// NOTE: Its not in our interest to start make a general purpose vectorizer /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. -static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, +static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -8296,9 +8293,9 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. -static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, +static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. @@ -8322,7 +8319,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, /// from a vector of source values and a vector of extraction indices. 
/// The vectors might be manipulated to match the type of the permute op. static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, - SDLoc &DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT ShuffleVT = VT; EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); @@ -8590,7 +8587,8 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. static SDValue -LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, +LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; // Check for a match of the permute source vector and permute index elements. @@ -8629,7 +8627,6 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, return SDValue(); } - SDLoc DL(V); MVT VT = V.getSimpleValueType(); return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } @@ -8645,14 +8642,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) - return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); + return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget); if (VT.getVectorElementType() == MVT::bf16 && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); - if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) - return VectorConstant; + if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget)) + return VectorCst; unsigned EVTBits = EltVT.getSizeInBits(); APInt UndefMask = APInt::getZero(NumElems); @@ -8747,13 +8744,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) + if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG)) return AddSub; - if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) + if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG)) return HorizontalOp; - if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) + if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG)) return Broadcast; - if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) + if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG)) return BitOp; unsigned NumZero = ZeroMask.popcount(); @@ -8901,8 +8898,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (IsAllConstants) return SDValue(); - if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) - return V; + if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget)) + return V; // See if we can use a vector load to get all of the elements. { -- cgit v1.1 From 8e707f8444692762b35fde3e94bbcb02686272a5 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 8 Feb 2024 12:33:43 +0000 Subject: [Flang][Lower] NFC: Update target-features/target-cpu tests (#80984) Previously, some of these lowering tests inadvertently relied on a default triple not introducing any target features. This caused failures when compiling on a ppc64le-linux-unknown-gnu system. 
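As an illustration (a minimal sketch, not itself part of the patch; the file name and FileCheck prefixes are placeholders), the host-dependent pattern being removed relies on the implicit default triple, while the new pattern pins the target explicitly:
! Host-dependent: the default triple decides which target features appear.
! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
! Host-independent: triple and CPU are stated explicitly.
! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 %s -o - | FileCheck %s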
This patch updates these lowering tests to always explicitly set the target triple and check that the -target-cpu and -target-features compiler options are processed as expected. --- flang/test/Lower/target-features-amdgcn.f90 | 23 ++++++++++++----------- flang/test/Lower/target-features-x86_64.f90 | 16 +++++++--------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/flang/test/Lower/target-features-amdgcn.f90 b/flang/test/Lower/target-features-amdgcn.f90 index 1f0439b..382230d 100644 --- a/flang/test/Lower/target-features-amdgcn.f90 +++ b/flang/test/Lower/target-features-amdgcn.f90 @@ -1,21 +1,22 @@ ! REQUIRES: amdgpu-registered-target -! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes=ALL,NONE -! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=ALL,TRIPLE -! RUN: %flang_fc1 -emit-fir -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,CPU -! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,BOTH +! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,CPU +! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,FEATURE +! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,BOTH ! ALL: module attributes { -! NONE-NOT: fir.target_cpu -! NONE-NOT: fir.target_features - -! TRIPLE-SAME: fir.target_cpu = "generic-hsa" -! TRIPLE-NOT: fir.target_features - ! CPU-SAME: fir.target_cpu = "gfx90a" -! CPU-NOT: fir.target_features +! CPU-SAME: fir.target_features = #llvm.target_features<[ +! CPU-SAME: "+gfx90a-insts" +! CPU-SAME: ]> + +! FEATURE-SAME: fir.target_features = #llvm.target_features<[ +! FEATURE-NOT: "+gfx90a-insts" +! FEATURE-SAME: "+sse" +! FEATURE-SAME: ]> ! BOTH-SAME: fir.target_cpu = "gfx90a" ! BOTH-SAME: fir.target_features = #llvm.target_features<[ ! BOTH-SAME: "+gfx90a-insts" +! BOTH-SAME: "+sse" ! BOTH-SAME: ]> diff --git a/flang/test/Lower/target-features-x86_64.f90 b/flang/test/Lower/target-features-x86_64.f90 index 1b628b6..282c479 100644 --- a/flang/test/Lower/target-features-x86_64.f90 +++ b/flang/test/Lower/target-features-x86_64.f90 @@ -1,19 +1,17 @@ ! REQUIRES: x86-registered-target -! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefixes=ALL,NONE ! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 %s -o - | FileCheck %s --check-prefixes=ALL,CPU ! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,FEATURE ! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,BOTH ! ALL: module attributes { -! NONE-NOT: fir.target_cpu -! NONE-NOT: fir.target_features +! CPU-SAME: fir.target_cpu = "x86-64" -! CPU-SAME: fir.target_cpu = "x86-64" -! CPU-NOT: fir.target_features - -! FEATURE-NOT: fir.target_cpu -! FEATURE-SAME: fir.target_features = #llvm.target_features<["+sse"]> +! FEATURE-SAME: fir.target_features = #llvm.target_features<[ +! FEATURE-SAME: "+sse" +! FEATURE-SAME: ]> ! BOTH-SAME: fir.target_cpu = "x86-64" -! BOTH-SAME: fir.target_features = #llvm.target_features<["+sse"]> +! BOTH-SAME: fir.target_features = #llvm.target_features<[ +! BOTH-SAME: "+sse" +! 
BOTH-SAME: ]> -- cgit v1.1 From 42902d22d1272c1bc10132b06be2d5251b17f225 Mon Sep 17 00:00:00 2001 From: Zain Jaffal Date: Tue, 2 Jan 2024 16:52:59 +0000 Subject: [InstCombine] Add tests for x / sqrt(y / z) with fast-math --- llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 85 +++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/fdiv-sqrt.ll diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll new file mode 100644 index 0000000..a8d4b6d --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +declare double @llvm.sqrt.f64(double) + +define double @sqrt_div_fast(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_fast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv fast double %y, %z + %sqrt = call fast double @llvm.sqrt.f64(double %div) + %div1 = fdiv fast double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv double %y, %z + %sqrt = call double @llvm.sqrt.f64(double %div) + %div1 = fdiv double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_arcp( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv reassoc arcp double %y, %z + %sqrt = call reassoc arcp double @llvm.sqrt.f64(double %div) + %div1 = fdiv reassoc arcp double %x, %sqrt + ret double %div1 +} + +declare void @use(double) +define double @sqrt_div_fast_multiple_uses_1(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_fast_multiple_uses_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: call void @use(double [[DIV]]) +; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv fast double %y, %z + call void @use(double %div) + %sqrt = call fast double @llvm.sqrt.f64(double %div) + %div1 = fdiv fast double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_fast_multiple_uses_2(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_fast_multiple_uses_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: call void @use(double [[SQRT]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv fast double 
%y, %z + %sqrt = call fast double @llvm.sqrt.f64(double %div) + call void @use(double %sqrt) + %div1 = fdiv fast double %x, %sqrt + ret double %div1 +} + -- cgit v1.1 From e50189b0fdf382e3e0d5fc5e58fe81a78d0de7c8 Mon Sep 17 00:00:00 2001 From: Zain Jaffal Date: Sat, 6 Jan 2024 17:31:48 +0000 Subject: [InstCombine] Add additional tests for fdiv-sqrt Add more tests where some of the instructions have missing flags. --- llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 96 ++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll index a8d4b6d..346271b 100644 --- a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll +++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll @@ -42,9 +42,99 @@ define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) { ; CHECK-NEXT: ret double [[DIV1]] ; entry: - %div = fdiv reassoc arcp double %y, %z - %sqrt = call reassoc arcp double @llvm.sqrt.f64(double %div) - %div1 = fdiv reassoc arcp double %x, %sqrt + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_missing(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_missing( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_missing2(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_missing2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_reassoc_missing3(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_reassoc_missing3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_arcp_missing(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_arcp_missing( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret 
double %div1 +} + +define double @sqrt_div_arcp_missing2(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_arcp_missing2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv arcp reassoc double %x, %sqrt + ret double %div1 +} + +define double @sqrt_div_arcp_missing3(double %x, double %y, double %z) { +; CHECK-LABEL: @sqrt_div_arcp_missing3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: ret double [[DIV1]] +; +entry: + %div = fdiv arcp reassoc double %y, %z + %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div) + %div1 = fdiv reassoc double %x, %sqrt ret double %div1 } -- cgit v1.1 From 4b72c5e8277f8688f7ce0bc953f9f3ea54420358 Mon Sep 17 00:00:00 2001 From: whisperity Date: Thu, 8 Feb 2024 13:37:55 +0100 Subject: [clang][Sema] Subclass `-Wshorten-64-to-32` under `-Wimplicit-int-conversion` (#80814) Although "implicit int conversions" is supposed to be a superset containing the more specific "64-to-32" case, the two groups were previously disjoint and were enabled together only by the much larger `-Wconversion`. --- clang/docs/ReleaseNotes.rst | 7 ++++++- clang/include/clang/Basic/DiagnosticGroups.td | 6 +++--- clang/test/Sema/conversion-64-32.c | 6 +++++- .../conversion-implicit-int-includes-64-to-32.c | 21 +++++++++++++++++++++ 4 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 clang/test/Sema/conversion-implicit-int-includes-64-to-32.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 52a48c7..e158284 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -149,7 +149,12 @@ Improvements to Clang's diagnostics prints. - Clang now diagnoses member template declarations with multiple declarators. -- Clang now diagnoses use of the ``template`` keyword after declarative nested name specifiers. + +- Clang now diagnoses use of the ``template`` keyword after declarative nested + name specifiers. + +- The ``-Wshorten-64-to-32`` diagnostic is now grouped under ``-Wimplicit-int-conversion`` instead + of ``-Wconversion``. Fixes `#69444 `_. 
Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 6765721..975eca0a 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -108,8 +108,10 @@ def EnumConversion : DiagGroup<"enum-conversion", EnumCompareConditional]>; def ObjCSignedCharBoolImplicitIntConversion : DiagGroup<"objc-signed-char-bool-implicit-int-conversion">; +def Shorten64To32 : DiagGroup<"shorten-64-to-32">; def ImplicitIntConversion : DiagGroup<"implicit-int-conversion", - [ObjCSignedCharBoolImplicitIntConversion]>; + [Shorten64To32, + ObjCSignedCharBoolImplicitIntConversion]>; def ImplicitConstIntFloatConversion : DiagGroup<"implicit-const-int-float-conversion">; def ImplicitIntFloatConversion : DiagGroup<"implicit-int-float-conversion", [ImplicitConstIntFloatConversion]>; @@ -631,7 +633,6 @@ def Shadow : DiagGroup<"shadow", [ShadowFieldInConstructorModified, def ShadowAll : DiagGroup<"shadow-all", [Shadow, ShadowFieldInConstructor, ShadowUncapturedLocal, ShadowField]>; -def Shorten64To32 : DiagGroup<"shorten-64-to-32">; def : DiagGroup<"sign-promo">; def SignCompare : DiagGroup<"sign-compare">; def SwitchDefault : DiagGroup<"switch-default">; @@ -942,7 +943,6 @@ def Conversion : DiagGroup<"conversion", EnumConversion, BitFieldEnumConversion, FloatConversion, - Shorten64To32, IntConversion, ImplicitIntConversion, ImplicitFloatConversion, diff --git a/clang/test/Sema/conversion-64-32.c b/clang/test/Sema/conversion-64-32.c index dc417ed..c172dd1 100644 --- a/clang/test/Sema/conversion-64-32.c +++ b/clang/test/Sema/conversion-64-32.c @@ -9,9 +9,13 @@ typedef long long long2 __attribute__((__vector_size__(16))); int4 test1(long2 a) { int4 v127 = a; // no warning. - return v127; + return v127; } int test2(long v) { return v / 2; // expected-warning {{implicit conversion loses integer precision: 'long' to 'int'}} } + +char test3(short s) { + return s * 2; // no warning. +} diff --git a/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c b/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c new file mode 100644 index 0000000..e22ccbe --- /dev/null +++ b/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -Wimplicit-int-conversion -triple x86_64-apple-darwin %s + +int test0(long v) { + return v; // expected-warning {{implicit conversion loses integer precision}} +} + +typedef int int4 __attribute__ ((vector_size(16))); +typedef long long long2 __attribute__((__vector_size__(16))); + +int4 test1(long2 a) { + int4 v127 = a; // no warning. + return v127; +} + +int test2(long v) { + return v / 2; // expected-warning {{implicit conversion loses integer precision: 'long' to 'int'}} +} + +char test3(short s) { + return s * 2; // expected-warning {{implicit conversion loses integer precision: 'int' to 'char'}} +} -- cgit v1.1 From 448fe73428a810eb67617e07c23510033a21de5a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 12:34:50 +0000 Subject: [X86] Add X86::getVectorRegisterWidth helper. NFC. Replaces internal helper used by addConstantComments to allow reuse in a future patch. 
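For context, a minimal sketch of the intended call pattern, mirroring the existing use sites in addConstantComments (the operand index is illustrative): // Width in bits (128, 256 or 512) of the vector register class of operand 0. unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); As the implementation below shows, the helper maps the VR128/VR128X, VR256/VR256X and VR512 register classes to 128, 256 and 512 bits respectively, and hits llvm_unreachable for any other register class.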
--- llvm/lib/Target/X86/X86InstrInfo.cpp | 12 ++++++++++++ llvm/lib/Target/X86/X86InstrInfo.h | 3 +++ llvm/lib/Target/X86/X86MCInstLower.cpp | 24 ++++++------------------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 0d30a31..0f21880 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3423,6 +3423,18 @@ unsigned X86::getSwappedVCMPImm(unsigned Imm) { return Imm; } +unsigned X86::getVectorRegisterWidth(const MCOperandInfo &Info) { + if (Info.RegClass == X86::VR128RegClassID || + Info.RegClass == X86::VR128XRegClassID) + return 128; + if (Info.RegClass == X86::VR256RegClassID || + Info.RegClass == X86::VR256XRegClassID) + return 256; + if (Info.RegClass == X86::VR512RegClassID) + return 512; + llvm_unreachable("Unknown register class!"); +} + /// Return true if the Reg is X87 register. static bool isX87Reg(unsigned Reg) { return (Reg == X86::FPCW || Reg == X86::FPSW || diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index ee0d2d0..996a24d 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -77,6 +77,9 @@ unsigned getSwappedVPCOMImm(unsigned Imm); /// Get the VCMP immediate if the opcodes are swapped. unsigned getSwappedVCMPImm(unsigned Imm); +/// Get the width of the vector register operand. +unsigned getVectorRegisterWidth(const MCOperandInfo &Info); + /// Check if the instruction is X87 instruction. bool isX87Instruction(MachineInstr &MI); diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index b336ba3..d3b7d97 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1388,18 +1388,6 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { return MBBI; } -static unsigned getRegisterWidth(const MCOperandInfo &Info) { - if (Info.RegClass == X86::VR128RegClassID || - Info.RegClass == X86::VR128XRegClassID) - return 128; - if (Info.RegClass == X86::VR256RegClassID || - Info.RegClass == X86::VR256XRegClassID) - return 256; - if (Info.RegClass == X86::VR512RegClassID) - return 512; - llvm_unreachable("Unknown register class!"); -} - static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) { if (X86II::isKMasked(MI->getDesc().TSFlags)) { // Skip mask operand. 
@@ -1648,7 +1636,7 @@ static void printZeroExtend(const MachineInstr *MI, MCStreamer &OutStreamer, CS << " = "; SmallVector Mask; - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); assert((Width % DstEltBits) == 0 && (DstEltBits % SrcEltBits) == 0 && "Illegal extension ratio"); DecodeZeroExtendMask(SrcEltBits, DstEltBits, Width / DstEltBits, false, Mask); @@ -1753,7 +1741,7 @@ static void addConstantComments(const MachineInstr *MI, case X86::VPSHUFBZrmkz: { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodePSHUFBMask(C, Width, Mask); if (!Mask.empty()) @@ -1775,7 +1763,7 @@ case X86::VPERMILPSZrmkz: { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMILPMask(C, 32, Width, Mask); if (!Mask.empty()) @@ -1796,7 +1784,7 @@ case X86::VPERMILPDZrmkz: { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMILPMask(C, 64, Width, Mask); if (!Mask.empty()) @@ -1824,7 +1812,7 @@ } if (auto *C = X86::getConstantFromPool(*MI, 3)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask); if (!Mask.empty()) @@ -1835,7 +1823,7 @@ case X86::VPPERMrrm: { if (auto *C = X86::getConstantFromPool(*MI, 3)) { - unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); + unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPPERMMask(C, Width, Mask); if (!Mask.empty()) -- cgit v1.1 From 6ea76c1328e04799981c78b3661a175a2ba47cec Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 12:41:55 +0000 Subject: [NFCI][RemoveDIs] Build LLVM with RemoveDIs iterators This commit flips a bit to make LLVM build with "debuginfo iterators", causing BasicBlock::iterator to contain a bit that's used for debug-info purposes. More about this can be read on Discourse [0], but the runtime impact of this should be negligible (iterators usually end up being inlined), and there should be no change to LLVM's behaviour as a result of this commit. What this does mean though, is that roughly 400 debug-info tests where we've added "--try-experimental-debuginfo-iterators" to RUN lines are going to start operating in RemoveDIs mode. These are already tested on the new-debug-iterators buildbot [1], and I've even tested with asan, so I'm not _expecting_ any turbulence. 
[0] https://discourse.llvm.org/t/rfc-instruction-api-changes-needed-to-eliminate-debug-intrinsics-from-ir/68939 [1] https://lab.llvm.org/buildbot/#/builders/275 --- llvm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 485c76b..c31980a 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -654,7 +654,7 @@ option(LLVM_EXTERNALIZE_DEBUGINFO "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) option(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS - "Add extra Booleans to ilist_iterators to communicate facts for debug-info" OFF) + "Add extra Booleans to ilist_iterators to communicate facts for debug-info" ON) set(LLVM_CODESIGNING_IDENTITY "" CACHE STRING "Sign executables and dylibs with the given identity or skip if empty (Darwin Only)") -- cgit v1.1 From ec1fcb381d884ca53e2e0dd4075f946c8f002de2 Mon Sep 17 00:00:00 2001 From: agozillon Date: Thu, 8 Feb 2024 14:03:39 +0100 Subject: [Flang][bbc] Prevent bbc -emit-fir command invoking OpenMP passes twice (#80927) Currently, when the bbc tool is invoked with the emit-fir command, the pass pipeline is invoked twice for verification, causing the previously added OpenMP pass pipeline to be invoked multiple times. This change prevents that by using a separate pass manager and running it immediately when the OpenMP passes need to be executed. --- flang/tools/bbc/bbc.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 9d5caf5..c9358c8 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -256,6 +256,22 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) { /*Reloc::Model=*/std::nullopt)}; } +/// Build and execute the OpenMPFIRPassPipeline with its own instance +/// of the pass manager, allowing it to be invoked as soon as it's +/// required without impacting the main pass pipeline that may be invoked +/// more than once for verification. +static mlir::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) { + mlir::PassManager pm(mlirModule->getName(), + mlir::OpPassManager::Nesting::Implicit); + fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice); + (void)mlir::applyPassManagerCLOptions(pm); + if (mlir::failed(pm.run(mlirModule))) { + llvm::errs() << "FATAL: failed to correctly apply OpenMP pass pipeline"; + return mlir::failure(); + } + return mlir::success(); +} + //===----------------------------------------------------------------------===// // Translate Fortran input to FIR, a dialect of MLIR. //===----------------------------------------------------------------------===// @@ -369,14 +385,16 @@ static mlir::LogicalResult convertFortranSourceToMLIR( "could not open output file ") << outputName; + // WARNING: This pipeline must be run immediately after the lowering to + // ensure that the FIR is correct with respect to OpenMP operations/ + // attributes. + if (enableOpenMP) + if (mlir::failed(runOpenMPPasses(mlirModule))) + return mlir::failure(); + // Otherwise run the default passes. mlir::PassManager pm(mlirModule->getName(), mlir::OpPassManager::Nesting::Implicit); - if (enableOpenMP) // WARNING: This pipeline must be run immediately after the lowering to // ensure that the FIR is correct with respect to OpenMP operations/ // attributes. 
- fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice); pm.enableVerifier(/*verifyPasses=*/true); (void)mlir::applyPassManagerCLOptions(pm); if (passPipeline.hasAnyOccurrences()) { -- cgit v1.1 From 72f04fa0734f8559ad515f507a4a3ce3f461f196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 8 Feb 2024 15:28:46 +0200 Subject: [OpenMP] [cmake] Don't use -fno-semantic-interposition on Windows (#81113) This was added in 4b7beab4187ab0766c3d7b272511d5751431a8da. When the flag was added implicitly elsewhere, it was added via llvm/cmake/modules/HandleLLVMOptions.cmake, where it wasn't added on Windows/Cygwin targets. This avoids one warning per object file in OpenMP. --- openmp/cmake/HandleOpenMPOptions.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index 7134620..9387d9b 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -46,7 +46,11 @@ append_if(OPENMP_HAVE_WEXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WPEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +if (NOT (WIN32 OR CYGWIN)) + # This flag is not relevant on Windows; the flag is accepted, but produces warnings + # about argument unused during compilation. + append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -- cgit v1.1 From 8697bbe2d4aed109520e83c6beab52196ec5b702 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Thu, 8 Feb 2024 16:31:57 +0300 Subject: [clang] Use CPlusPlus language option instead of Bool (#80975) As pointed out in https://github.com/llvm/llvm-project/pull/80724, we should not be checking `getLangOpts().Bool` when determining anything related to logical operators, since it only indicates that the bool keyword is present, not what semantics logical operators have. As a side effect, a missing `-Wpointer-bool-conversion` warning in OpenCL C was restored since, like C23, OpenCL C has the bool keyword but logical operators still return int. --- clang/lib/Sema/SemaChecking.cpp | 8 ++++---- clang/test/SemaOpenCL/operators.cl | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index c775ff2..f8b73c7 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -16129,10 +16129,10 @@ static void CheckConditionalOperator(Sema &S, AbstractConditionalOperator *E, /// Check conversion of given expression to boolean. /// Input argument E is a logical expression. static void CheckBoolLikeConversion(Sema &S, Expr *E, SourceLocation CC) { - // While C23 does have bool as a keyword, we still need to run the bool-like - // conversion checks as bools are still not used as the return type from - // "boolean" operators or as the input type for conditional operators. 
- if (S.getLangOpts().Bool && !S.getLangOpts().C23) + // Run the bool-like conversion checks only for C since there bools are + // still not used as the return type from "boolean" operators or as the input + // type for conditional operators. + if (S.getLangOpts().CPlusPlus) return; if (E->IgnoreParenImpCasts()->getType()->isAtomicType()) return; diff --git a/clang/test/SemaOpenCL/operators.cl b/clang/test/SemaOpenCL/operators.cl index cf359ac..76a7692 100644 --- a/clang/test/SemaOpenCL/operators.cl +++ b/clang/test/SemaOpenCL/operators.cl @@ -118,6 +118,6 @@ kernel void pointer_ops(){ bool b = !p; b = p==0; int i; - b = !&i; + b = !&i; // expected-warning {{address of 'i' will always evaluate to 'true'}} b = &i==(int *)1; } -- cgit v1.1 From fe8a62c46365f5ef0c15df2265bbf0026d0a4047 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Thu, 8 Feb 2024 19:16:29 +0530 Subject: [MLIR] Fix crash in AffineMap::replace for zero result maps (#80930) Fix obvious bug in AffineMap::replace for the case of zero result maps. Extend/complete inferExprsFromList to work with empty expression lists. --- mlir/include/mlir/Dialect/Affine/IR/AffineOps.td | 3 ++- .../mlir/Dialect/Utils/StructuredOpsUtils.h | 4 +++- mlir/include/mlir/IR/AffineMap.h | 6 ++++-- mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp | 14 ++++++++----- mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp | 8 ++++++-- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 8 ++++++-- mlir/lib/Dialect/Linalg/Transforms/Split.cpp | 4 +++- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 17 +++++++-------- .../SparseTensor/Transforms/SparseGPUCodegen.cpp | 4 +++- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 7 ++++--- .../Vector/Transforms/LowerVectorContract.cpp | 4 +++- .../VectorTransferSplitRewritePatterns.cpp | 2 +- .../Dialect/Vector/Transforms/VectorTransforms.cpp | 9 +++++--- mlir/lib/IR/AffineMap.cpp | 24 ++++++++++++++-------- mlir/lib/IR/BuiltinTypes.cpp | 2 +- mlir/unittests/IR/AffineMapTest.cpp | 23 +++++++++++++++++++++ mlir/unittests/IR/CMakeLists.txt | 1 + 17 files changed, 99 insertions(+), 41 deletions(-) create mode 100644 mlir/unittests/IR/AffineMapTest.cpp diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 225e4d3..edcfcfd 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -67,7 +67,8 @@ def AffineApplyOp : Affine_Op<"apply", [Pure]> { OpBuilder<(ins "ArrayRef ":$exprList,"ValueRange":$mapOperands), [{ build($_builder, $_state, $_builder.getIndexType(), - AffineMap::inferFromExprList(exprList).front(), mapOperands); + AffineMap::inferFromExprList(exprList, $_builder.getContext()) + .front(), mapOperands); }]> ]; diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h index 134c556..929a2a7 100644 --- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h @@ -121,7 +121,9 @@ public: } bool layout(MapList l) { - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, ctx); + }; return maps == infer(l); } diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h index cd751af..cce1412 100644 --- a/mlir/include/mlir/IR/AffineMap.h +++ b/mlir/include/mlir/IR/AffineMap.h @@ -122,9 +122,11 @@ public: /// `exprs.size()`, as many dims as the largest dim in `exprs` and as many /// symbols as 
the largest symbol in `exprs`. static SmallVector - inferFromExprList(ArrayRef> exprsList); + inferFromExprList(ArrayRef> exprsList, + MLIRContext *context); static SmallVector - inferFromExprList(ArrayRef> exprsList); + inferFromExprList(ArrayRef> exprsList, + MLIRContext *context); MLIRContext *getContext() const; diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 1eb5678..f4f6dadf 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -2010,7 +2010,8 @@ public: } bool didEncounterError = false; - auto maps = AffineMap::inferFromExprList({srcExprs, dstExprs, dstExprs}); + auto maps = AffineMap::inferFromExprList({srcExprs, dstExprs, dstExprs}, + rewriter.getContext()); auto linalgOp = rewriter.create( loc, ArrayRef({resultTy, resultMaxTy}), input, ValueRange({filledTensorIdx, filledTensorMax}), maps, iteratorTypes, @@ -2351,9 +2352,11 @@ struct RFFT2dConverter final : public OpRewritePattern { createZeroTensor(rewriter, loc, outputType, dynamicSizes)}; // Indexing maps for input and output tensors - auto indexingMaps = AffineMap::inferFromExprList(llvm::ArrayRef{ - affineDimsExpr(rewriter, 0, 3, 4), affineDimsExpr(rewriter, 0, 1, 2), - affineDimsExpr(rewriter, 0, 1, 2)}); + auto indexingMaps = AffineMap::inferFromExprList( + llvm::ArrayRef{affineDimsExpr(rewriter, 0, 3, 4), + affineDimsExpr(rewriter, 0, 1, 2), + affineDimsExpr(rewriter, 0, 1, 2)}, + rewriter.getContext()); // Width and height dimensions of the original input. auto dimH = rewriter.createOrFold(loc, input, 1); @@ -2463,7 +2466,8 @@ struct FFT2dConverter final : OpRewritePattern { ArrayRef{RFFT2dConverter::affineDimsExpr(rewriter, 0, 3, 4), RFFT2dConverter::affineDimsExpr(rewriter, 0, 3, 4), RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2), - RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2)}); + RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2)}, + rewriter.getContext()); // Width and height dimensions of the original input. auto dimH = rewriter.createOrFold(loc, input_real, 1); diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index b63baf3..85fb8a5 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -77,7 +77,9 @@ static void getXferIndices(RewriterBase &rewriter, TransferOpType xferOp, static bool contractSupportsMMAMatrixType(vector::ContractionOp contract, bool useNvGpu) { using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, contract.getContext()); + }; AffineExpr m, n, k; bindDims(contract.getContext(), m, n, k); auto iteratorTypes = contract.getIteratorTypes().getValue(); @@ -394,7 +396,9 @@ struct PrepareContractToGPUMMA // Set up the parallel/reduction structure in right form. 
using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr m, n, k; bindDims(rewriter.getContext(), m, n, k); static constexpr std::array perm = {1, 0}; diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index adb56ab..c4b1319 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -1145,7 +1145,9 @@ AffineApplyOp mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e, ArrayRef operands) { return makeComposedAffineApply( - b, loc, AffineMap::inferFromExprList(ArrayRef{e}).front(), + b, loc, + AffineMap::inferFromExprList(ArrayRef{e}, b.getContext()) + .front(), operands); } @@ -1220,7 +1222,9 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineExpr expr, ArrayRef operands) { return makeComposedFoldedAffineApply( - b, loc, AffineMap::inferFromExprList(ArrayRef{expr}).front(), + b, loc, + AffineMap::inferFromExprList(ArrayRef{expr}, b.getContext()) + .front(), operands); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp index 0174db4..47b5fcd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp @@ -83,7 +83,9 @@ linalg::splitOp(RewriterBase &rewriter, TilingInterface op, unsigned dimension, bindDims(rewriter.getContext(), d0, d1, d2); OpFoldResult minSplitPoint = affine::makeComposedFoldedAffineMin( rewriter, op.getLoc(), - AffineMap::inferFromExprList(ArrayRef{d0, d1 + d2}).front(), + AffineMap::inferFromExprList(ArrayRef{d0, d1 + d2}, + rewriter.getContext()) + .front(), {splitPoint, offsets[dimension], sizes[dimension]}); // Compute the size of the second part. Return early if the second part would diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 986b5f3..5d220c6 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -670,7 +670,8 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile, << ": make sure in bound with affine.min\n"); AffineExpr dim0, dim1, dim2; - bindDims(builder.getContext(), dim0, dim1, dim2); + MLIRContext *context = builder.getContext(); + bindDims(context, dim0, dim1, dim2); // Get the dimension size for this dimension. We need to first calculate // the max index and then plus one. This is important because for @@ -678,12 +679,12 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile, // form `(d0 * s0 + d1)`, where `d0`/`d1 is an output/filter window // dimension and `s0` is stride. Directly use the dimension size of // output/filer window dimensions will cause incorrect calculation. 
- AffineMap minusOneMap = - AffineMap::inferFromExprList({ArrayRef{dim0 - 1}}) - .front(); - AffineMap plusOneMap = - AffineMap::inferFromExprList({ArrayRef{dim0 + 1}}) - .front(); + AffineMap minusOneMap = AffineMap::inferFromExprList( + {ArrayRef{dim0 - 1}}, context) + .front(); + AffineMap plusOneMap = AffineMap::inferFromExprList( + {ArrayRef{dim0 + 1}}, context) + .front(); SmallVector maxIndices = llvm::to_vector(llvm::map_range(ubs, [&](OpFoldResult ub) { return makeComposedFoldedAffineApply(rewriter, loc, minusOneMap, @@ -696,7 +697,7 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile, // Compute min(dim - offset, size) to avoid out-of-bounds accesses. AffineMap minMap = AffineMap::inferFromExprList( - {ArrayRef{dim1 - dim2, dim0}}) + {ArrayRef{dim1 - dim2, dim0}}, context) .front(); size = makeComposedFoldedAffineMin(rewriter, loc, minMap, {size, d, offset}); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index 87a37a7..dd3af9d 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -1263,7 +1263,9 @@ struct LinalgOpRewriter : public OpRewritePattern { SmallVector maps = op.getIndexingMapsArray(); using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr i, j, k; bindDims(getContext(), i, j, k); diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 4523544..5be6a62 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -675,9 +675,10 @@ void vector::ContractionOp::build(OpBuilder &builder, OperationState &result, ArrayRef iteratorTypes) { result.addOperands({lhs, rhs, acc}); result.addTypes(acc.getType()); - result.addAttribute(getIndexingMapsAttrName(result.name), - builder.getAffineMapArrayAttr( - AffineMap::inferFromExprList(indexingExprs))); + result.addAttribute( + getIndexingMapsAttrName(result.name), + builder.getAffineMapArrayAttr( + AffineMap::inferFromExprList(indexingExprs, builder.getContext()))); result.addAttribute( getIteratorTypesAttrName(result.name), builder.getArrayAttr(llvm::to_vector(llvm::map_range( diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp index 446eb85..0eaf9f7 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp @@ -695,7 +695,9 @@ ContractionOpToDotLowering::matchAndRewrite(vector::ContractionOp op, Value lhs = op.getLhs(), rhs = op.getRhs(); using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr m, n, k; bindDims(rewriter.getContext(), m, n, k); SmallVector maps = op.getIndexingMapsArray(); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp index f1a2716..b844c2b 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp @@ -209,7 +209,7 @@ 
createSubViewIntersection(RewriterBase &b, VectorTransferOpInterface xferOp, AffineExpr i, j, k; bindDims(xferOp.getContext(), i, j, k); SmallVector<AffineMap, 4> maps = - AffineMap::inferFromExprList(MapList{{i - j, k}}); + AffineMap::inferFromExprList(MapList{{i - j, k}}, b.getContext()); // affine_min(%dimMemRef - %index, %dimAlloc) Value affineMin = b.create<affine::AffineMinOp>( loc, index.getType(), maps[0], ValueRange{dimMemRef, index, dimAlloc}); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index 4034dc4..53ae138 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -160,8 +160,9 @@ struct MultiReduceToContract iteratorTypes.push_back(vector::IteratorType::reduction); } } - auto dstMap = AffineMap::get(/*dimCount=*/reductionMask.size(), - /*symCount=*/0, exprs, reduceOp.getContext()); + auto dstMap = + AffineMap::get(/*dimCount=*/reductionMask.size(), + /*symbolCount=*/0, exprs, reduceOp.getContext()); rewriter.replaceOpWithNewOp<vector::ContractionOp>( reduceOp, mulOp->getOperand(0), mulOp->getOperand(1), reduceOp.getAcc(), rewriter.getAffineMapArrayAttr({srcMap, srcMap, dstMap}), @@ -1399,7 +1400,9 @@ struct CanonicalizeContractMatmulToMMT final // Set up the parallel/reduction structure in right form. using MapList = ArrayRef<ArrayRef<AffineExpr>>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + auto infer = [&](MapList m) { + return AffineMap::inferFromExprList(m, op.getContext()); + }; AffineExpr m; AffineExpr n; AffineExpr k; diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index c280462..4aa0d4f 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -272,12 +272,16 @@ AffineMap AffineMap::getMultiDimMapWithTargets(unsigned numDims, return result; } +/// Creates an affine map for each list of AffineExpr's in `exprsList` +/// while inferring the right number of dimensional and symbolic inputs needed +/// based on the maximum dimensional and symbolic identifier appearing in the +/// expressions.
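+/// (Editor's illustrative note, not part of the original patch: with dims
+/// d0, d1 and symbol s0, passing {{d0 + s0}, {d1}} yields two maps, each
+/// with two dims and one symbol, since the largest dim (d1) and the largest
+/// symbol (s0) are taken across all the lists.)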
template static SmallVector -inferFromExprList(ArrayRef exprsList) { - assert(!exprsList.empty()); - assert(!exprsList[0].empty()); - auto context = exprsList[0][0].getContext(); +inferFromExprList(ArrayRef exprsList, + MLIRContext *context) { + if (exprsList.empty()) + return {}; int64_t maxDim = -1, maxSym = -1; getMaxDimAndSymbol(exprsList, maxDim, maxSym); SmallVector maps; @@ -289,13 +293,15 @@ inferFromExprList(ArrayRef exprsList) { } SmallVector -AffineMap::inferFromExprList(ArrayRef> exprsList) { - return ::inferFromExprList(exprsList); +AffineMap::inferFromExprList(ArrayRef> exprsList, + MLIRContext *context) { + return ::inferFromExprList(exprsList, context); } SmallVector -AffineMap::inferFromExprList(ArrayRef> exprsList) { - return ::inferFromExprList(exprsList); +AffineMap::inferFromExprList(ArrayRef> exprsList, + MLIRContext *context) { + return ::inferFromExprList(exprsList, context); } uint64_t AffineMap::getLargestKnownDivisorOfMapExprs() { @@ -521,7 +527,7 @@ AffineMap::replace(const DenseMap &map) const { newResults.reserve(getNumResults()); for (AffineExpr e : getResults()) newResults.push_back(e.replace(map)); - return AffineMap::inferFromExprList(newResults).front(); + return AffineMap::inferFromExprList(newResults, getContext()).front(); } AffineMap AffineMap::dropResults(const llvm::SmallBitVector &positions) const { diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp index 9b8ee3d..1794b38 100644 --- a/mlir/lib/IR/BuiltinTypes.cpp +++ b/mlir/lib/IR/BuiltinTypes.cpp @@ -921,7 +921,7 @@ AffineExpr mlir::makeCanonicalStridedLayoutExpr(ArrayRef sizes, return getAffineConstantExpr(0, context); assert(!exprs.empty() && "expected exprs"); - auto maps = AffineMap::inferFromExprList(exprs); + auto maps = AffineMap::inferFromExprList(exprs, context); assert(!maps.empty() && "Expected one non-empty map"); unsigned numDims = maps[0].getNumDims(), nSymbols = maps[0].getNumSymbols(); diff --git a/mlir/unittests/IR/AffineMapTest.cpp b/mlir/unittests/IR/AffineMapTest.cpp new file mode 100644 index 0000000..081afad --- /dev/null +++ b/mlir/unittests/IR/AffineMapTest.cpp @@ -0,0 +1,23 @@ +//===- AffineMapTest.cpp - unit tests for affine map API ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Builders.h" +#include "gtest/gtest.h" + +using namespace mlir; + +// Test AffineMap replace API for the zero result case. 
+TEST(AffineMapTest, inferMapFromAffineExprs) { + MLIRContext ctx; + OpBuilder b(&ctx); + AffineMap map = b.getEmptyAffineMap(); + DenseMap<AffineExpr, AffineExpr> replacements; + map.replace(replacements); + EXPECT_EQ(map, map); +} diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt index 1ed4686..e7e9c3b 100644 --- a/mlir/unittests/IR/CMakeLists.txt +++ b/mlir/unittests/IR/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(MLIRIRTests AdaptorTest.cpp + AffineMapTest.cpp AttributeTest.cpp DialectTest.cpp InterfaceTest.cpp -- cgit v1.1 From d63c8bee58b5d4dad9f1c550a342e782e0038f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 15:29:44 +0100 Subject: [clang][ExprConst] Remove unnecessary cast FD is a FunctionDecl, so no need to cast a FunctionDecl to a CXXMethodDecl just to assign it to a FunctionDecl. --- clang/lib/AST/ExprConstant.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 089bc20..02e153f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -8006,7 +8006,8 @@ public: assert(CorrespondingCallOpSpecialization && "We must always have a function call operator specialization " "that corresponds to our static invoker specialization"); - FD = cast<CXXMethodDecl>(CorrespondingCallOpSpecialization); + assert(isa<CXXMethodDecl>(CorrespondingCallOpSpecialization)); + FD = CorrespondingCallOpSpecialization; } else FD = LambdaCallOp; } else if (FD->isReplaceableGlobalAllocationFunction()) { -- cgit v1.1 From 3ad63593dac390e320808f3de0e1906c5fa45c8a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 15:29:32 +0100 Subject: [PatternMatch] Add m_PtrAdd() matcher (NFC) This matches a getelementptr i8 instruction or constant expression, with a given pointer operand and index. --- llvm/include/llvm/IR/PatternMatch.h | 22 ++++++++++++++++++++++ llvm/unittests/IR/PatternMatch.cpp | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 3155e7d..fed5524 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -1614,6 +1614,21 @@ struct m_SplatOrUndefMask { } }; +template <typename PointerOpTy, typename OffsetOpTy> struct PtrAdd_match { + PointerOpTy PointerOp; + OffsetOpTy OffsetOp; + + PtrAdd_match(const PointerOpTy &PointerOp, const OffsetOpTy &OffsetOp) + : PointerOp(PointerOp), OffsetOp(OffsetOp) {} + + template <typename OpTy> bool match(OpTy *V) { + auto *GEP = dyn_cast<GEPOperator>(V); + return GEP && GEP->getSourceElementType()->isIntegerTy(8) && + PointerOp.match(GEP->getPointerOperand()) && + OffsetOp.match(GEP->idx_begin()->get()); + } +}; + /// Matches ShuffleVectorInst independently of mask value.
template <typename V1_t, typename V2_t> inline TwoOps_match<V1_t, V2_t, Instruction::ShuffleVector> @@ -1647,6 +1662,13 @@ inline auto m_GEP(const OperandTypes &...Ops) { return AnyOps_match<Instruction::GetElementPtr, OperandTypes...>(Ops...); } +/// Matches GEP with i8 source element type +template <typename PointerOpTy, typename OffsetOpTy> +inline PtrAdd_match<PointerOpTy, OffsetOpTy> +m_PtrAdd(const PointerOpTy &PointerOp, const OffsetOpTy &OffsetOp) { + return PtrAdd_match<PointerOpTy, OffsetOpTy>(PointerOp, OffsetOp); +} + //===----------------------------------------------------------------------===// // Matchers for CastInst classes // diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 885b134..883149c 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -1889,4 +1889,26 @@ TEST_F(PatternMatchTest, ConstExpr) { EXPECT_TRUE(match(V, m_ConstantExpr())); } +TEST_F(PatternMatchTest, PtrAdd) { + Type *PtrTy = PointerType::getUnqual(Ctx); + Type *IdxTy = Type::getInt64Ty(Ctx); + Constant *Null = Constant::getNullValue(PtrTy); + Constant *Offset = ConstantInt::get(IdxTy, 42); + Value *PtrAdd = IRB.CreatePtrAdd(Null, Offset); + Value *OtherGEP = IRB.CreateGEP(IdxTy, Null, Offset); + Value *PtrAddConst = + ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ctx), Null, Offset); + + Value *A, *B; + EXPECT_TRUE(match(PtrAdd, m_PtrAdd(m_Value(A), m_Value(B)))); + EXPECT_EQ(A, Null); + EXPECT_EQ(B, Offset); + + EXPECT_TRUE(match(PtrAddConst, m_PtrAdd(m_Value(A), m_Value(B)))); + EXPECT_EQ(A, Null); + EXPECT_EQ(B, Offset); + + EXPECT_FALSE(match(OtherGEP, m_PtrAdd(m_Value(A), m_Value(B)))); +} + } // anonymous namespace. -- cgit v1.1 From d9e92765c5f9b0fa7adafa769dd13d37b6bca038 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 8 Feb 2024 22:34:52 +0800 Subject: [ConstantRange] Improve ConstantRange::binaryXor (#80146) `ConstantRange::binaryXor` gives poor results as it currently depends on `KnownBits::operator^`. Since `sub A, B` is canonicalized into `xor A, B` if `B` is the subset of `A`, this patch reverts the transform in `ConstantRange::binaryXor`, which will give better results. Alive2: https://alive2.llvm.org/ce/z/bmTMV9 Fixes #79696. --- llvm/lib/IR/ConstantRange.cpp | 17 +++++++++- llvm/test/Transforms/SCCP/pr79696.ll | 55 +++++++++++++++++++++++++++++++++ llvm/unittests/IR/ConstantRangeTest.cpp | 6 ++++ 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SCCP/pr79696.ll diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index cbb64b2..3394a1e 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -1467,7 +1467,22 @@ ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const { if (isSingleElement() && getSingleElement()->isAllOnes()) return Other.binaryNot(); - return fromKnownBits(toKnownBits() ^ Other.toKnownBits(), /*IsSigned*/false); + KnownBits LHSKnown = toKnownBits(); + KnownBits RHSKnown = Other.toKnownBits(); + KnownBits Known = LHSKnown ^ RHSKnown; + ConstantRange CR = fromKnownBits(Known, /*IsSigned*/ false); + // Typically the following code doesn't improve the result if BW = 1. + if (getBitWidth() == 1) + return CR; + + // If LHS is known to be the subset of RHS, treat LHS ^ RHS as RHS -nuw/nsw + // LHS. If RHS is known to be the subset of LHS, treat LHS ^ RHS as LHS + // -nuw/nsw RHS.
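+ // (Editor's illustration, not in the original patch: with LHS = [0, 51)
+ // and RHS = {63}, every possible LHS value only has bits that are set in
+ // 63, so LHS ^ 63 equals 63 - LHS exactly, yielding the tight range
+ // [13, 64) where plain known-bits reasoning gives a looser result.)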
+ if ((~LHSKnown.Zero).isSubsetOf(RHSKnown.One)) + CR = CR.intersectWith(Other.sub(*this), PreferredRangeType::Unsigned); + else if ((~RHSKnown.Zero).isSubsetOf(LHSKnown.One)) + CR = CR.intersectWith(this->sub(Other), PreferredRangeType::Unsigned); + return CR; } ConstantRange diff --git a/llvm/test/Transforms/SCCP/pr79696.ll b/llvm/test/Transforms/SCCP/pr79696.ll new file mode 100644 index 0000000..a860112 --- /dev/null +++ b/llvm/test/Transforms/SCCP/pr79696.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=ipsccp -S | FileCheck %s + +; Tests from PR79696 + +define i1 @constant_range_xor(i64 %a) { +; CHECK-LABEL: define i1 @constant_range_xor( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], 8192 +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CTLZ:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A]], i1 true) +; CHECK-NEXT: [[CONV:%.*]] = xor i64 [[CTLZ]], 63 +; CHECK-NEXT: ret i1 false +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp = icmp ugt i64 %a, 8192 + br i1 %cmp, label %then, label %else +then: + %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) ;[0, 50] + %conv = xor i64 %ctlz, 63 ;[13, 63] + %cmp1 = icmp ult i64 %conv, 13 + ret i1 %cmp1 +else: + ret i1 false +} + +define i1 @constant_range_xor_negative(i64 %a) { +; CHECK-LABEL: define i1 @constant_range_xor_negative( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], 8192 +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CTLZ:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A]], i1 true) +; CHECK-NEXT: [[CONV:%.*]] = xor i64 [[CTLZ]], 62 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[CONV]], 13 +; CHECK-NEXT: ret i1 [[CMP1]] +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp = icmp ugt i64 %a, 8192 + br i1 %cmp, label %then, label %else +then: + %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) ;[0, 50] + %conv = xor i64 %ctlz, 62 ;[12, 63] + %cmp1 = icmp ult i64 %conv, 13 + ret i1 %cmp1 +else: + ret i1 false +} diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index e505af5..34a162a 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -2565,6 +2565,12 @@ TEST_F(ConstantRangeTest, binaryXor) { EXPECT_EQ(R16_35.binaryXor(R0_99), ConstantRange(APInt(8, 0), APInt(8, 128))); EXPECT_EQ(R0_99.binaryXor(R16_35), ConstantRange(APInt(8, 0), APInt(8, 128))); + // Treat xor A, B as sub nsw nuw A, B + ConstantRange R0_51(APInt(8, 0), APInt(8, 51)); + ConstantRange R63(APInt(8, 63)); + EXPECT_EQ(R0_51.binaryXor(R63), ConstantRange(APInt(8, 13), APInt(8, 64))); + EXPECT_EQ(R63.binaryXor(R0_51), ConstantRange(APInt(8, 13), APInt(8, 64))); + TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { return CR1.binaryXor(CR2); -- cgit v1.1 From 06774d6bbf32aff45b67d8c3753524ec36bf8869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 09:55:07 +0100 Subject: [clang][Interp] Handle CXXInheritedCtorInitExprs We need to forward all arguments of the current function and call the ctor function. 
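[Editor's note: a minimal sketch of the C++ construct this patch handles, mirroring the tests it adds below; the names are illustrative only.]

struct A { int x; constexpr A(int x) : x(x) {} };
struct B : A { using A::A; }; // inheriting constructor
static_assert(B(42).x == 42, ""); // initialization goes through a CXXInheritedCtorInitExpr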
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 31 +++++++++++++++++ clang/lib/AST/Interp/ByteCodeExprGen.h | 1 + clang/test/AST/Interp/records.cpp | 60 ++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 59fddfc..21bc29f 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2020,6 +2020,37 @@ bool ByteCodeExprGen::VisitObjCBoolLiteralExpr( return this->emitConst(E->getValue(), E); } +template +bool ByteCodeExprGen::VisitCXXInheritedCtorInitExpr( + const CXXInheritedCtorInitExpr *E) { + const CXXConstructorDecl *Ctor = E->getConstructor(); + assert(!Ctor->isTrivial() && + "Trivial CXXInheritedCtorInitExpr, implement. (possible?)"); + const Function *F = this->getFunction(Ctor); + assert(F); + assert(!F->hasRVO()); + assert(F->hasThisPointer()); + + if (!this->emitDupPtr(SourceInfo{})) + return false; + + // Forward all arguments of the current function (which should be a + // constructor itself) to the inherited ctor. + // This is necessary because the calling code has pushed the pointer + // of the correct base for us already, but the arguments need + // to come after. + unsigned Offset = align(primSize(PT_Ptr)); // instance pointer. + for (const ParmVarDecl *PD : Ctor->parameters()) { + PrimType PT = this->classify(PD->getType()).value_or(PT_Ptr); + + if (!this->emitGetParam(PT, Offset, E)) + return false; + Offset += align(primSize(PT)); + } + + return this->emitCall(F, E); +} + template bool ByteCodeExprGen::discard(const Expr *E) { if (E->containsErrors()) return false; diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 2c9cca5..c908a9b 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -111,6 +111,7 @@ public: bool VisitGenericSelectionExpr(const GenericSelectionExpr *E); bool VisitChooseExpr(const ChooseExpr *E); bool VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *E); + bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E); protected: bool visitExpr(const Expr *E) override; diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index 5ce1e6e..1ef13f5 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1223,3 +1223,63 @@ namespace IndirectFieldInit { #endif } + +namespace InheritedConstructor { + namespace PR47555 { + struct A { + int c; + int d; + constexpr A(int c, int d) : c(c), d(d){} + }; + struct B : A { using A::A; }; + + constexpr B b = {13, 1}; + static_assert(b.c == 13, ""); + static_assert(b.d == 1, ""); + } + + namespace PR47555_2 { + struct A { + int c; + int d; + double e; + constexpr A(int c, int &d, double e) : c(c), d(++d), e(e){} + }; + struct B : A { using A::A; }; + + constexpr int f() { + int a = 10; + B b = {10, a, 40.0}; + return a; + } + static_assert(f() == 11, ""); + } + + namespace AaronsTest { + struct T { + constexpr T(float) {} + }; + + struct Base { + constexpr Base(T t = 1.0f) {} + constexpr Base(float) {} + }; + + struct FirstMiddle : Base { + using Base::Base; + constexpr FirstMiddle() : Base(2.0f) {} + }; + + struct SecondMiddle : Base { + constexpr SecondMiddle() : Base(3.0f) {} + constexpr SecondMiddle(T t) : Base(t) {} + }; + + struct S : FirstMiddle, SecondMiddle { + using FirstMiddle::FirstMiddle; + constexpr S(int i) : S(4.0f) {} + }; + + constexpr S s(1); + } +} -- cgit v1.1 From 
c4b0dfcc99da7506bff6b57d563e5cbce9caf4cd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 8 Feb 2024 09:44:42 -0500 Subject: [Clang] Fix a non-effective assertion (#81083) `PTy` here is literally `FTy->getParamType(i)`, which makes this assertion not work as expected. --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e051cbc..a7a410d 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,7 +5908,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) && + assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. -- cgit v1.1 From fb6ef4233968ffefb616d1c779a5483ef1f140d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 10:49:14 +0100 Subject: [clang][Interp][NFC] Convert records test to verify=expected,both style --- clang/test/AST/Interp/records.cpp | 187 ++++++++++++++------------------------ 1 file changed, 66 insertions(+), 121 deletions(-) diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index 1ef13f5..fb50d1c 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple i686 -verify %s -// RUN: %clang_cc1 -verify=ref %s -// RUN: %clang_cc1 -verify=ref -std=c++14 %s -// RUN: %clang_cc1 -verify=ref -std=c++20 %s -// RUN: %clang_cc1 -verify=ref -triple i686 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple i686 -verify=expected,both %s +// RUN: %clang_cc1 -verify=ref,both %s +// RUN: %clang_cc1 -verify=ref,both -std=c++14 %s +// RUN: %clang_cc1 -verify=ref,both -std=c++20 %s +// RUN: %clang_cc1 -verify=ref,both -triple i686 %s /// Used to crash. 
struct Empty {}; @@ -90,9 +90,8 @@ struct Ints2 { int a = 10; int b; }; -constexpr Ints2 ints22; // expected-error {{without a user-provided default constructor}} \ - // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{without a user-provided default constructor}} +constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}} \ + // expected-error {{must be initialized by a constant expression}} constexpr Ints2 I2 = Ints2{12, 25}; static_assert(I2.a == 12, ""); @@ -164,17 +163,13 @@ constexpr C RVOAndParams(int a) { } constexpr C RVOAndParamsResult2 = RVOAndParams(12); -class Bar { // expected-note {{definition of 'Bar' is not complete}} \ - // ref-note {{definition of 'Bar' is not complete}} +class Bar { // both-note {{definition of 'Bar' is not complete}} public: constexpr Bar(){} - constexpr Bar b; // expected-error {{cannot be constexpr}} \ - // expected-error {{has incomplete type 'const Bar'}} \ - // ref-error {{cannot be constexpr}} \ - // ref-error {{has incomplete type 'const Bar'}} + constexpr Bar b; // both-error {{cannot be constexpr}} \ + // both-error {{has incomplete type 'const Bar'}} }; -constexpr Bar B; // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} +constexpr Bar B; // both-error {{must be initialized by a constant expression}} constexpr Bar *pb = nullptr; constexpr int locals() { @@ -198,17 +193,13 @@ namespace thisPointer { constexpr int get12() { return 12; } }; - constexpr int foo() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int foo() { // both-error {{never produces a constant expression}} S *s = nullptr; - return s->get12(); // ref-note 2{{member call on dereferenced null pointer}} \ - // expected-note 2{{member call on dereferenced null pointer}} + return s->get12(); // both-note 2{{member call on dereferenced null pointer}} } - static_assert(foo() == 12, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'foo()'}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'foo()'}} + static_assert(foo() == 12, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'foo()'}} }; struct FourBoolPairs { @@ -244,20 +235,16 @@ constexpr A a{}; static_assert(a.i == 100, ""); constexpr A a2{12}; static_assert(a2.i == 12, ""); -static_assert(a2.i == 200, ""); // ref-error {{static assertion failed}} \ - // ref-note {{evaluates to '12 == 200'}} \ - // expected-error {{static assertion failed}} \ - // expected-note {{evaluates to '12 == 200'}} +static_assert(a2.i == 200, ""); // both-error {{static assertion failed}} \ + // both-note {{evaluates to '12 == 200'}} struct S { int a = 0; constexpr int get5() const { return 5; } constexpr void fo() const { - this; // expected-warning {{expression result unused}} \ - // ref-warning {{expression result unused}} - this->a; // expected-warning {{expression result unused}} \ - // ref-warning {{expression result unused}} + this; // both-warning {{expression result unused}} + this->a; // both-warning {{expression result unused}} get5(); getInts(); } @@ -342,12 +329,9 @@ namespace InitializerTemporaries { // Invalid destructor. 
struct S { constexpr S() {} - constexpr ~S() noexcept(false) { throw 12; } // expected-error {{cannot use 'throw'}} \ - // expected-error {{never produces a constant expression}} \ - // expected-note 2{{subexpression not valid}} \ - // ref-error {{cannot use 'throw'}} \ - // ref-error {{never produces a constant expression}} \ - // ref-note 2{{subexpression not valid}} + constexpr ~S() noexcept(false) { throw 12; } // both-error {{cannot use 'throw'}} \ + // both-error {{never produces a constant expression}} \ + // both-note 2{{subexpression not valid}} }; constexpr int f() { @@ -355,10 +339,8 @@ namespace InitializerTemporaries { /// FIXME: Wrong source location below. return 12; // expected-note {{in call to '&S{}->~S()'}} } - static_assert(f() == 12); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'f()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'f()'}} + static_assert(f() == 12); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'f()'}} #endif @@ -423,7 +405,8 @@ namespace MI { namespace DeriveFailures { #if __cplusplus < 202002L - struct Base { // ref-note 2{{declared here}} expected-note {{declared here}} + struct Base { // both-note {{declared here}} \ + // ref-note {{declared here}} int Val; }; @@ -431,35 +414,29 @@ namespace DeriveFailures { int OtherVal; constexpr Derived(int i) : OtherVal(i) {} // ref-error {{never produces a constant expression}} \ - // ref-note 2{{non-constexpr constructor 'Base' cannot be used in a constant expression}} \ - // expected-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} + // both-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} \ + // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} }; - constexpr Derived D(12); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to 'Derived(12)'}} \ - // ref-note {{declared here}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to 'Derived(12)'}} + constexpr Derived D(12); // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to 'Derived(12)'}} \ + // ref-note {{declared here}} - static_assert(D.Val == 0, ""); // ref-error {{not an integral constant expression}} \ + static_assert(D.Val == 0, ""); // both-error {{not an integral constant expression}} \ // ref-note {{initializer of 'D' is not a constant expression}} \ - // expected-error {{not an integral constant expression}} \ // expected-note {{read of uninitialized object}} #endif struct AnotherBase { int Val; - constexpr AnotherBase(int i) : Val(12 / i) {} //ref-note {{division by zero}} \ - //expected-note {{division by zero}} + constexpr AnotherBase(int i) : Val(12 / i) {} // both-note {{division by zero}} }; struct AnotherDerived : AnotherBase { constexpr AnotherDerived(int i) : AnotherBase(i) {} }; - constexpr AnotherBase Derp(0); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to 'AnotherBase(0)'}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to 'AnotherBase(0)'}} + constexpr AnotherBase Derp(0); // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to 'AnotherBase(0)'}} struct YetAnotherBase { int Val; @@ -467,17 +444,14 @@ namespace DeriveFailures { }; struct 
YetAnotherDerived : YetAnotherBase { - using YetAnotherBase::YetAnotherBase; // ref-note {{declared here}} \ - // expected-note {{declared here}} + using YetAnotherBase::YetAnotherBase; // both-note {{declared here}} int OtherVal; constexpr bool doit() const { return Val == OtherVal; } }; - constexpr YetAnotherDerived Oops(0); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}} + constexpr YetAnotherDerived Oops(0); // both-error {{must be initialized by a constant expression}} \ + // both-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}} }; namespace EmptyCtor { @@ -543,18 +517,10 @@ namespace PointerArith { constexpr B *b1 = &b + 1; constexpr B *b2 = &b + 0; -#if 0 - constexpr A *a2 = &b + 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot access base class of pointer past the end of object}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot access base class of pointer past the end of object}} - -#endif - constexpr const int *pn = &(&b + 1)->n; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot access field of pointer past the end of object}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot access field of pointer past the end of object}} - + constexpr A *a2 = &b + 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot access base class of pointer past the end of object}} + constexpr const int *pn = &(&b + 1)->n; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot access field of pointer past the end of object}} } #if __cplusplus >= 202002L @@ -632,12 +598,9 @@ namespace Destructors { struct S { constexpr S() {} - constexpr ~S() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} - int i = 1 / 0; // expected-warning {{division by zero}} \ - // expected-note 2{{division by zero}} \ - // ref-warning {{division by zero}} \ - // ref-note 2{{division by zero}} + constexpr ~S() { // both-error {{never produces a constant expression}} + int i = 1 / 0; // both-warning {{division by zero}} \ + // both-note 2{{division by zero}} } }; constexpr int testS() { @@ -645,10 +608,8 @@ namespace Destructors { return 1; // expected-note {{in call to '&S{}->~S()'}} // FIXME: ^ Wrong line } - static_assert(testS() == 1); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'testS()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'testS()'}} + static_assert(testS() == 1); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'testS()'}} } namespace BaseToDerived { @@ -657,10 +618,8 @@ namespace A { struct B : A { int n; }; struct C : B {}; C c = {}; - constexpr C *pb = (C*)((A*)&c + 1); // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot access derived class of pointer past the end of object}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note 
{{cannot access derived class of pointer past the end of object}} + constexpr C *pb = (C*)((A*)&c + 1); // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot access derived class of pointer past the end of object}} } namespace B { struct A {}; @@ -894,10 +853,8 @@ namespace VirtualFromBase { // Virtual f(), not OK. constexpr X> xxs2; constexpr X *q = const_cast>*>(&xxs2); - static_assert(q->f() == sizeof(X), ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{cannot evaluate call to virtual function}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{cannot evaluate call to virtual function}} + static_assert(q->f() == sizeof(X), ""); // both-error {{not an integral constant expression}} \ + // both-note {{cannot evaluate call to virtual function}} } #endif @@ -1070,14 +1027,10 @@ namespace ParenInit { /// Not constexpr! O o1(0); - constinit O o2(0); // ref-error {{variable does not have a constant initializer}} \ - // ref-note {{required by 'constinit' specifier}} \ - // ref-note {{reference to temporary is not a constant expression}} \ - // ref-note {{temporary created here}} \ - // expected-error {{variable does not have a constant initializer}} \ - // expected-note {{required by 'constinit' specifier}} \ - // expected-note {{reference to temporary is not a constant expression}} \ - // expected-note {{temporary created here}} + constinit O o2(0); // both-error {{variable does not have a constant initializer}} \ + // both-note {{required by 'constinit' specifier}} \ + // both-note {{reference to temporary is not a constant expression}} \ + // both-note {{temporary created here}} } #endif @@ -1109,32 +1062,24 @@ namespace AccessOnNullptr { int a; }; - constexpr int a() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} + constexpr int a() { // both-error {{never produces a constant expression}} F *f = nullptr; - f->a = 0; // expected-note 2{{cannot access field of null pointer}} \ - // ref-note 2{{cannot access field of null pointer}} + f->a = 0; // both-note 2{{cannot access field of null pointer}} return f->a; } - static_assert(a() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'a()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'a()'}} + static_assert(a() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'a()'}} - constexpr int a2() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} + constexpr int a2() { // both-error {{never produces a constant expression}} F *f = nullptr; - const int *a = &(f->a); // expected-note 2{{cannot access field of null pointer}} \ - // ref-note 2{{cannot access field of null pointer}} + const int *a = &(f->a); // both-note 2{{cannot access field of null pointer}} return f->a; } - static_assert(a2() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'a2()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'a2()'}} + static_assert(a2() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'a2()'}} } namespace IndirectFieldInit { -- cgit v1.1 From 10cd0e7a8bdcd80c0b017f8d0b6b71dd61973b54 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Thu, 8 Feb 2024 07:56:16 -0700 
Subject: [flang][docs] Update flang documentation regarding the test suite (#80755) Remove redundant reference to flang not being able to generate code. Add a reference to the gfortran tests that are part of the LLVM Test Suite. --- flang/docs/FortranLLVMTestSuite.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/flang/docs/FortranLLVMTestSuite.md b/flang/docs/FortranLLVMTestSuite.md index 62459e6..f07d415 100644 --- a/flang/docs/FortranLLVMTestSuite.md +++ b/flang/docs/FortranLLVMTestSuite.md @@ -12,12 +12,6 @@ first-time users read through [LLVM Test Suite Guide](https://llvm.org/docs/TestSuiteGuide.html) which describes the organizational structure of the test suite and how to run it. -Although the Flang driver is unable to generate code at this time, we -are neverthelesss incrementally adding Fortran tests into the LLVM -Test Suite. We are currently testing against GFortran while we make -progress towards completing the new Flang driver with full -code-generation capabilities. - ## Running the LLVM test-suite with Fortran Fortran support can be enabled by setting the following CMake variables: @@ -63,3 +57,12 @@ cmake -G "Ninja" -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ \ -DTEST_SUITE_FORTRAN:STRING=ON \ -DTEST_SUITE_SPEC2017_ROOT= .. ``` + +## Running the gfortran tests + +Tests from the gfortran test suite have been imported into the LLVM Test Suite. +The tests will be run automatically if the test suite is built following the +instructions described [above](#running-the-LLVM-test-suite-with-fortran). +There are additional configure-time options that can be used with the gfortran +tests. More details about those options and their purpose can be found in +[`Fortran/gfortran/README.md`](https://github.com/llvm/llvm-test-suite/tree/main/Fortran/gfortran/README.md)`. -- cgit v1.1 From cd183428a9af6d7dda2018a88aeb495f268716b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 15:15:14 +0100 Subject: [clang][Interp] Fix handling of generic lambdas When compiling their static invoker, we need to get the right specialization. --- clang/lib/AST/Interp/ByteCodeEmitter.cpp | 30 +++++++++++++++++++++++++++++- clang/test/AST/Interp/lambda.cpp | 13 +++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp index 8bbfa92..e697e24f 100644 --- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp +++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp @@ -23,6 +23,34 @@ using namespace clang; using namespace clang::interp; Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { + bool IsLambdaStaticInvoker = false; + if (const auto *MD = dyn_cast(FuncDecl); + MD && MD->isLambdaStaticInvoker()) { + // For a lambda static invoker, we might have to pick a specialized + // version if the lambda is generic. In that case, the picked function + // will *NOT* be a static invoker anymore. However, it will still + // be a non-static member function, this (usually) requiring an + // instance pointer. We suppress that later in this function. 
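+ // (Editor's illustration, not in the original patch: given
+ // `auto GL = [](auto a) { return a; };` converted to a `char (*)(char)`
+ // function pointer, the function compiled here is the call operator
+ // specialization `operator()<char>`, found via the invoker's template
+ // arguments.)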
+ IsLambdaStaticInvoker = true; + + const CXXRecordDecl *ClosureClass = MD->getParent(); + assert(ClosureClass->captures_begin() == ClosureClass->captures_end()); + if (ClosureClass->isGenericLambda()) { + const CXXMethodDecl *LambdaCallOp = ClosureClass->getLambdaCallOperator(); + assert(MD->isFunctionTemplateSpecialization() && + "A generic lambda's static-invoker function must be a " + "template specialization"); + const TemplateArgumentList *TAL = MD->getTemplateSpecializationArgs(); + FunctionTemplateDecl *CallOpTemplate = + LambdaCallOp->getDescribedFunctionTemplate(); + void *InsertPos = nullptr; + const FunctionDecl *CorrespondingCallOpSpecialization = + CallOpTemplate->findSpecialization(TAL->asArray(), InsertPos); + assert(CorrespondingCallOpSpecialization); + FuncDecl = cast(CorrespondingCallOpSpecialization); + } + } + // Set up argument indices. unsigned ParamOffset = 0; SmallVector ParamTypes; @@ -46,7 +74,7 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // InterpStack when calling the function. bool HasThisPointer = false; if (const auto *MD = dyn_cast(FuncDecl)) { - if (MD->isImplicitObjectMemberFunction()) { + if (MD->isImplicitObjectMemberFunction() && !IsLambdaStaticInvoker) { HasThisPointer = true; ParamTypes.push_back(PT_Ptr); ParamOffsets.push_back(ParamOffset); diff --git a/clang/test/AST/Interp/lambda.cpp b/clang/test/AST/Interp/lambda.cpp index f840089..a433e56 100644 --- a/clang/test/AST/Interp/lambda.cpp +++ b/clang/test/AST/Interp/lambda.cpp @@ -155,6 +155,19 @@ namespace StaticInvoker { return fp(i).a; } static_assert(sv6(12) == 12); + + + /// A generic lambda. + auto GL = [](auto a) { return a; }; + constexpr char (*fp2)(char) = GL; + static_assert(fp2('3') == '3', ""); + + struct GLS { + int a; + }; + auto GL2 = [](auto a) { return GLS{a}; }; + constexpr GLS (*fp3)(char) = GL2; + static_assert(fp3('3').a == '3', ""); } namespace LambdasAsParams { -- cgit v1.1 From 3e33b6f5de6905c98395a77b41d474b87ef9e677 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 8 Feb 2024 10:11:39 -0500 Subject: [libc++][NFC] Reformat a few files that had gotten mis-formatted Those appear to be oversights when committing patches in the last few months. --- libcxx/include/ostream | 36 ++++++++++++++++-------------------- libcxx/include/scoped_allocator | 4 ++-- libcxx/include/shared_mutex | 6 +++--- libcxx/include/string | 16 +++++++++------- libcxx/include/valarray | 4 ++-- libcxx/include/vector | 4 ++-- 6 files changed, 34 insertions(+), 36 deletions(-) diff --git a/libcxx/include/ostream b/libcxx/include/ostream index 180adda..2e26073 100644 --- a/libcxx/include/ostream +++ b/libcxx/include/ostream @@ -1090,11 +1090,10 @@ _LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os); # ifndef _LIBCPP_HAS_NO_UNICODE template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). 
-_LIBCPP_HIDE_FROM_ABI void -__vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) { -#if _LIBCPP_AVAILABILITY_HAS_PRINT == 0 +_LIBCPP_HIDE_FROM_ABI void __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) { +# if _LIBCPP_AVAILABILITY_HAS_PRINT == 0 return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl); -#else +# else FILE* __file = std::__get_ostream_file(__os); if (!__file || !__print::__is_terminal(__file)) return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl); @@ -1110,38 +1109,36 @@ __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __wr // This is the path for the native API, start with flushing. __os.flush(); -# ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -# endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS ostream::sentry __s(__os); if (__s) { -# ifndef _LIBCPP_WIN32API +# ifndef _LIBCPP_WIN32API __print::__vprint_unicode_posix(__file, __fmt, __args, __write_nl, true); -# elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) __print::__vprint_unicode_windows(__file, __fmt, __args, __write_nl, true); -# else -# error "Windows builds with wchar_t disabled are not supported." -# endif +# else +# error "Windows builds with wchar_t disabled are not supported." +# endif } -# ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { __os.__set_badbit_and_consider_rethrow(); } -# endif // _LIBCPP_HAS_NO_EXCEPTIONS -#endif // _LIBCPP_AVAILABILITY_HAS_PRINT +# endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_AVAILABILITY_HAS_PRINT } template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). -_LIBCPP_HIDE_FROM_ABI inline void -vprint_unicode(ostream& __os, string_view __fmt, format_args __args) { +_LIBCPP_HIDE_FROM_ABI inline void vprint_unicode(ostream& __os, string_view __fmt, format_args __args) { std::__vprint_unicode(__os, __fmt, __args, false); } # endif // _LIBCPP_HAS_NO_UNICODE template -_LIBCPP_HIDE_FROM_ABI void -print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI void print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { # ifndef _LIBCPP_HAS_NO_UNICODE if constexpr (__print::__use_unicode_execution_charset) std::__vprint_unicode(__os, __fmt.get(), std::make_format_args(__args...), false); @@ -1153,8 +1150,7 @@ print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { } template -_LIBCPP_HIDE_FROM_ABI void -println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI void println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) { # ifndef _LIBCPP_HAS_NO_UNICODE // Note the wording in the Standard is inefficient. The output of // std::format is a std::string which is then copied. 
This solution diff --git a/libcxx/include/scoped_allocator b/libcxx/include/scoped_allocator index 1626453..eff6fbd 100644 --- a/libcxx/include/scoped_allocator +++ b/libcxx/include/scoped_allocator @@ -476,8 +476,8 @@ public: } private: - _LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor(outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT - : base(std::move(__o), std::move(__i)) {} + _LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor( + outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT : base(std::move(__o), std::move(__i)) {} template _LIBCPP_HIDE_FROM_ABI void __construct(integral_constant, _Tp* __p, _Args&&... __args) { diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex index ac66b3a..57f385b 100644 --- a/libcxx/include/shared_mutex +++ b/libcxx/include/shared_mutex @@ -124,9 +124,9 @@ template #include <__config> -# ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -# endif +#ifdef _LIBCPP_HAS_NO_THREADS +# error " is not supported since libc++ has been configured without support for threads." +#endif #include <__assert> // all public C++ headers provide the assertion handler #include <__availability> diff --git a/libcxx/include/string b/libcxx/include/string index ed4fdbe..530a223 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -938,7 +938,11 @@ public: // Turning off ASan instrumentation for variable initialization with _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS // does not work consistently during initialization of __r_, so we instead unpoison __str's memory manually first. // __str's memory needs to be unpoisoned only in the case where it's a short string. - : __r_([](basic_string &__s) -> decltype(__s.__r_)&& { if(!__s.__is_long()) __s.__annotate_delete(); return std::move(__s.__r_); }(__str)) { + : __r_([](basic_string& __s) -> decltype(__s.__r_)&& { + if (!__s.__is_long()) + __s.__annotate_delete(); + return std::move(__s.__r_); + }(__str)) { __str.__r_.first() = __rep(); __str.__annotate_new(0); if (!__is_long()) @@ -1918,7 +1922,7 @@ private: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT { - (void) __current_size; + (void)__current_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1); @@ -1933,7 +1937,7 @@ private: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT { - (void) __n; + (void)__n; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n); @@ -1941,7 +1945,7 @@ private: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT { - (void) __old_size; + (void)__old_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1); @@ -1952,9 +1956,7 @@ private: static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __align_it(size_type __s) 
_NOEXCEPT { return (__s + (__a - 1)) & ~(__a - 1); } - enum { - __alignment = 8 - }; + enum { __alignment = 8 }; static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __s) _NOEXCEPT { if (__s < __min_cap) { return static_cast<size_type>(__min_cap) - 1; } diff --git a/libcxx/include/valarray b/libcxx/include/valarray index 44adcd7..88b161e 100644 --- a/libcxx/include/valarray +++ b/libcxx/include/valarray @@ -2435,7 +2435,7 @@ template <class _Expr, __enable_if_t<__is_val_expr<_Expr>::value, int> > inline valarray<_Tp>& valarray<_Tp>::operator*=(const _Expr& __v) { size_t __i = 0; for (value_type* __t = __begin_; __t != __end_; ++__t, ++__i) - *__t *= std::__get(__v,__i); + *__t *= std::__get(__v, __i); return *this; } @@ -2444,7 +2444,7 @@ template <class _Expr, __enable_if_t<__is_val_expr<_Expr>::value, int> > inline valarray<_Tp>& valarray<_Tp>::operator/=(const _Expr& __v) { size_t __i = 0; for (value_type* __t = __begin_; __t != __end_; ++__t, ++__i) - *__t /= std::__get(__v,__i); + *__t /= std::__get(__v, __i); return *this; } diff --git a/libcxx/include/vector b/libcxx/include/vector index e9615ab..3934361 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -831,8 +831,8 @@ private: // For more details, see the "Using libc++" documentation page or // the documentation for __sanitizer_annotate_contiguous_container. - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_contiguous_container( - const void* __old_mid, const void* __new_mid) const { + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __annotate_contiguous_container(const void* __old_mid, const void* __new_mid) const { (void)__old_mid; (void)__new_mid; #ifndef _LIBCPP_HAS_NO_ASAN -- cgit v1.1 From 5452cbc4a6bfb905fedeacaa6f27895e249da1e5 Mon Sep 17 00:00:00 2001 From: ostannard Date: Thu, 8 Feb 2024 15:31:54 +0000 Subject: [AArch64] Indirect tail-calls cannot use x16 with pac-ret+pc (#81020) When using -mbranch-protection=pac-ret+pc, x16 is used in the function epilogue to hold the address of the signing instruction. This is used by a HINT instruction which can only use x16, so we can't change this. This means that we can't use it to hold the function pointer for an indirect tail-call. There is existing code to force indirect tail-calls to use x16 or x17 when BTI is enabled, so there are now 4 combinations:

bti  pac-ret+pc  Valid function pointer registers
off  off         Any non callee-saved register
on   off         x16 or x17
off  on          Any non callee-saved register except x16
on   on          x17

--- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 4 +- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 +- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 4 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 47 +++++++++++++--- llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 15 +++-- .../Target/AArch64/GISel/AArch64CallLowering.cpp | 15 +++-- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 4 +- .../branch-target-enforcement-indirect-calls.ll | 65 ++++++++++++++++++++++ llvm/test/CodeGen/AArch64/kcfi-bti.ll | 4 +- 9 files changed, 138 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index de24725..5b5ffd7 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1602,7 +1602,9 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { // attributes (isCall, isReturn, etc.). We lower them to the real // instruction here.
case AArch64::TCRETURNri: - case AArch64::TCRETURNriBTI: + case AArch64::TCRETURNrix16x17: + case AArch64::TCRETURNrix17: + case AArch64::TCRETURNrinotx16: case AArch64::TCRETURNriALL: { MCInst TmpInst; TmpInst.setOpcode(AArch64::BR); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8573939..20290c9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25700,7 +25700,9 @@ AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, case AArch64::BLR: case AArch64::BLRNoIP: case AArch64::TCRETURNri: - case AArch64::TCRETURNriBTI: + case AArch64::TCRETURNrix16x17: + case AArch64::TCRETURNrix17: + case AArch64::TCRETURNrinotx16: break; default: llvm_unreachable("Unexpected CFI call opcode"); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 9add7d8..39c9609 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2503,7 +2503,9 @@ bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) { return false; case AArch64::TCRETURNdi: case AArch64::TCRETURNri: - case AArch64::TCRETURNriBTI: + case AArch64::TCRETURNrix16x17: + case AArch64::TCRETURNrix17: + case AArch64::TCRETURNrinotx16: case AArch64::TCRETURNriALL: return true; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 77fdb68..9c3a692 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -928,8 +928,25 @@ let RecomputePerFunction = 1 in { // Avoid generating STRQro if it is slow, unless we're optimizing for code size. def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">; - def UseBTI : Predicate<[{ MF->getInfo()->branchTargetEnforcement() }]>; - def NotUseBTI : Predicate<[{ !MF->getInfo()->branchTargetEnforcement() }]>; + // Register restrictions for indirect tail-calls: + // - If branch target enforcement is enabled, indirect calls must use x16 or + // x17, because these are the only registers which can target the BTI C + // instruction. + // - If PAuthLR is enabled, x16 is used in the epilogue to hold the address + // of the signing instruction. This can't be changed because it is used by a + // HINT instruction which only accepts x16. We can't load anything from the + // stack after this because the authentication instruction checks that SP is + // the same as it was at function entry, so we can't have anything on the + // stack. 
+ + // BTI on, PAuthLR off: x16 or x17 + def TailCallX16X17 : Predicate<[{ MF->getInfo()->branchTargetEnforcement() && !MF->getInfo()->branchProtectionPAuthLR() }]>; + // BTI on, PAuthLR on: x17 only + def TailCallX17 : Predicate<[{ MF->getInfo()->branchTargetEnforcement() && MF->getInfo()->branchProtectionPAuthLR() }]>; + // BTI off, PAuthLR on: Any non-callee-saved register except x16 + def TailCallNotX16 : Predicate<[{ !MF->getInfo()->branchTargetEnforcement() && MF->getInfo()->branchProtectionPAuthLR() }]>; + // BTI off, PAuthLR off: Any non-callee-saved register + def TailCallAny : Predicate<[{ !MF->getInfo()->branchTargetEnforcement() && !MF->getInfo()->branchProtectionPAuthLR() }]>; def SLSBLRMitigation : Predicate<[{ MF->getSubtarget().hardenSlsBlr() }]>; def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget().hardenSlsBlr() }]>; @@ -9121,18 +9138,30 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { // some verifier checks for outlined functions. def TCRETURNriALL : Pseudo<(outs), (ins GPR64:$dst, i32imm:$FPDiff), []>, Sched<[WriteBrReg]>; - // Indirect tail-call limited to only use registers (x16 and x17) which are - // allowed to tail-call a "BTI c" instruction. - def TCRETURNriBTI : Pseudo<(outs), (ins rtcGPR64:$dst, i32imm:$FPDiff), []>, + + // Indirect tail-calls with reduced register classes, needed for BTI and + // PAuthLR. + def TCRETURNrix16x17 : Pseudo<(outs), (ins tcGPRx16x17:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNrix17 : Pseudo<(outs), (ins tcGPRx17:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNrinotx16 : Pseudo<(outs), (ins tcGPRnotx16:$dst, i32imm:$FPDiff), []>, Sched<[WriteBrReg]>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>, - Requires<[NotUseBTI]>; -def : Pat<(AArch64tcret rtcGPR64:$dst, (i32 timm:$FPDiff)), - (TCRETURNriBTI rtcGPR64:$dst, imm:$FPDiff)>, - Requires<[UseBTI]>; + Requires<[TailCallAny]>; +def : Pat<(AArch64tcret tcGPRx16x17:$dst, (i32 timm:$FPDiff)), + (TCRETURNrix16x17 tcGPRx16x17:$dst, imm:$FPDiff)>, + Requires<[TailCallX16X17]>; +def : Pat<(AArch64tcret tcGPRx17:$dst, (i32 timm:$FPDiff)), + (TCRETURNrix17 tcGPRx17:$dst, imm:$FPDiff)>, + Requires<[TailCallX17]>; +def : Pat<(AArch64tcret tcGPRnotx16:$dst, (i32 timm:$FPDiff)), + (TCRETURNrinotx16 tcGPRnotx16:$dst, imm:$FPDiff)>, + Requires<[TailCallNotX16]>; + def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index b70ab85..569944e 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -217,11 +217,16 @@ def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X2 X22, X23, X24, X25, X26, X27, X28, FP, LR)>; -// Restricted set of tail call registers, for use when branch target -// enforcement is enabled. These are the only registers which can be used to -// indirectly branch (not call) to the "BTI c" instruction at the start of a -// BTI-protected function. -def rtcGPR64 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>; +// Restricted sets of tail call registers, for use when branch target +// enforcement or PAuthLR are enabled. 
+// For BTI, x16 and x17 are the only registers which can be used to indirectly +// branch (not call) to the "BTI c" instruction at the start of a BTI-protected +// function. +// For PAuthLR, x16 must be used in the function epilogue for other purposes, +// so cannot hold the function pointer. +def tcGPRx17 : RegisterClass<"AArch64", [i64], 64, (add X17)>; +def tcGPRx16x17 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>; +def tcGPRnotx16 : RegisterClass<"AArch64", [i64], 64, (sub tcGPR64, X16)>; // Register set that excludes registers that are reserved for procedure calls. // This is used for pseudo-instructions that are actually implemented using a diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 55cad84..3dc3d31 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -1012,16 +1012,23 @@ bool AArch64CallLowering::isEligibleForTailCallOptimization( static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall) { + const AArch64FunctionInfo *FuncInfo = CallerF.getInfo(); + if (!IsTailCall) return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL; if (!IsIndirect) return AArch64::TCRETURNdi; - // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use - // x16 or x17. - if (CallerF.getInfo()->branchTargetEnforcement()) - return AArch64::TCRETURNriBTI; + // When BTI or PAuthLR are enabled, there are restrictions on using x16 and + // x17 to hold the function pointer. + if (FuncInfo->branchTargetEnforcement()) { + if (FuncInfo->branchProtectionPAuthLR()) + return AArch64::TCRETURNrix17; + else + return AArch64::TCRETURNrix16x17; + } else if (FuncInfo->branchProtectionPAuthLR()) + return AArch64::TCRETURNrinotx16; return AArch64::TCRETURNri; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index b8e5e7b..0fc4d7f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -273,7 +273,9 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, case AArch64::GPR64common_and_GPR64noipRegClassID: case AArch64::GPR64noip_and_tcGPR64RegClassID: case AArch64::tcGPR64RegClassID: - case AArch64::rtcGPR64RegClassID: + case AArch64::tcGPRx16x17RegClassID: + case AArch64::tcGPRx17RegClassID: + case AArch64::tcGPRnotx16RegClassID: case AArch64::WSeqPairsClassRegClassID: case AArch64::XSeqPairsClassRegClassID: case AArch64::MatrixIndexGPR32_8_11RegClassID: diff --git a/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll b/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll index de543f4..833a6d5 100644 --- a/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll +++ b/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll @@ -26,3 +26,68 @@ entry: ; CHECK: br {{x16|x17}} ret void } +define void @bti_enabled_force_x10(ptr %p) "branch-target-enforcement"="true" { +entry: + %p_x10 = tail call ptr asm "", "={x10},{x10},~{lr}"(ptr %p) + tail call void %p_x10() +; CHECK: br {{x16|x17}} + ret void +} + +; sign-return-address places no further restrictions on the tail-call register. 
+ +define void @bti_enabled_pac_enabled(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" { +entry: + tail call void %p() +; CHECK: br {{x16|x17}} + ret void +} +define void @bti_enabled_pac_enabled_force_x10(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" { +entry: + %p_x10 = tail call ptr asm "", "={x10},{x10},~{lr}"(ptr %p) + tail call void %p_x10() +; CHECK: br {{x16|x17}} + ret void +} + +; PAuthLR needs to use x16 to hold the address of the signing instruction. That +; can't be changed because the hint instruction only uses that register, so the +; only choice for the tail-call function pointer is x17. + +define void @bti_enabled_pac_pc_enabled(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + tail call void %p() +; CHECK: br x17 + ret void +} +define void @bti_enabled_pac_pc_enabled_force_x16(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + %p_x16 = tail call ptr asm "", "={x16},{x16},~{lr}"(ptr %p) + tail call void %p_x16() +; CHECK: br x17 + ret void +} + +; PAuthLR by itself prevents x16 from being used, but any other +; non-callee-saved register can be used. + +define void @pac_pc_enabled(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + tail call void %p() +; CHECK: br {{(x[0-9]|x1[0-578])$}} + ret void +} +define void @pac_pc_enabled_force_x16(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + %p_x16 = tail call ptr asm "", "={x16},{x16},~{lr}"(ptr %p) + tail call void %p_x16() +; CHECK: br {{(x[0-9]|x1[0-578])$}} + ret void +} +define void @pac_pc_enabled_force_x17(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" { +entry: + %p_x17 = tail call ptr asm "", "={x17},{x17},~{lr}"(ptr %p) + tail call void %p_x17() +; CHECK: br x17 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/kcfi-bti.ll b/llvm/test/CodeGen/AArch64/kcfi-bti.ll index 12cde43..d3febb5 100644 --- a/llvm/test/CodeGen/AArch64/kcfi-bti.ll +++ b/llvm/test/CodeGen/AArch64/kcfi-bti.ll @@ -73,11 +73,11 @@ define void @f3(ptr noundef %x) { ; MIR-LABEL: name: f3 ; MIR: body: -; ISEL: TCRETURNriBTI %1, 0, csr_aarch64_aapcs, implicit $sp, cfi-type 12345678 +; ISEL: TCRETURNrix16x17 %1, 0, csr_aarch64_aapcs, implicit $sp, cfi-type 12345678 ; KCFI: BUNDLE{{.*}} { ; KCFI-NEXT: KCFI_CHECK $x16, 12345678, implicit-def $x9, implicit-def $x16, implicit-def $x17, implicit-def $nzcv -; KCFI-NEXT: TCRETURNriBTI internal killed $x16, 0, csr_aarch64_aapcs, implicit $sp +; KCFI-NEXT: TCRETURNrix16x17 internal killed $x16, 0, csr_aarch64_aapcs, implicit $sp ; KCFI-NEXT: } tail call void %x() [ "kcfi"(i32 12345678) ] -- cgit v1.1 From 0802596df3d1ffd15f6b828a0f5c1e5b687a730f Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Thu, 8 Feb 2024 10:38:50 -0500 Subject: [Flang] Update the fix of PR 80738 to cover generic interface inside modules (#81087) The following test case crashes. The problem is that the fix for PR https://github.com/llvm/llvm-project/pull/80738 is not quite complete: it should call `GetUltimate()` on the `interface_` symbol before checking whether it is generic. A sketch of the corrected lookup is shown next.
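As a hedged illustration (not the committed code; the standalone helper and the exact API calls are simplified from `flang/lib/Semantics/resolve-names.cpp`), the corrected lookup resolves the use-association first:

```cpp
#include "flang/Semantics/symbol.h"
using namespace Fortran::semantics;

// Sketch only: a use-associated name wraps the module's symbol, so the
// generic-interface check must be made on the ultimate symbol, not on the
// use-association wrapper that `interface_` may name.
const Symbol *ResolveProcInterface(Symbol &interfaceSymbol) {
  Symbol &ultimate{interfaceSymbol.GetUltimate()};
  if (const auto *generic{ultimate.detailsIf<GenericDetails>()}) {
    return generic->specific(); // the specific procedure behind the generic
  }
  return &ultimate;
}
```

The Fortran reproducer follows: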
``` MODULE M CONTAINS FUNCTION Int(Arg) INTEGER :: Int, Arg Int = Arg END FUNCTION FUNCTION Int8(Arg) INTEGER(8) :: Int8, Arg Int8 = 8_8 END FUNCTION END MODULE MODULE M1 USE M INTERFACE Int8 MODULE PROCEDURE Int MODULE PROCEDURE Int8 END INTERFACE END MODULE PROGRAM PtrAssignGen USE M USE M1 IMPLICIT NONE INTERFACE Int MODULE PROCEDURE Int MODULE PROCEDURE Int8 END INTERFACE PROCEDURE(Int8), POINTER :: PtrInt8 PtrInt8 => Int8 IF ( PtrInt8(100_8) .NE. 8_8 ) ERROR STOP 12 END ``` --- flang/lib/Semantics/resolve-names.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 36deab9..2a42c791 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5648,9 +5648,10 @@ void DeclarationVisitor::Post(const parser::ProcDecl &x) { const auto &name{std::get(x.t)}; const Symbol *procInterface{nullptr}; if (interfaceName_) { - procInterface = interfaceName_->symbol->has() - ? interfaceName_->symbol->get().specific() - : interfaceName_->symbol; + Symbol *ultimate{&interfaceName_->symbol->GetUltimate()}; + procInterface = ultimate->has() + ? ultimate->get().specific() + : ultimate; } auto attrs{HandleSaveName(name.source, GetAttrs())}; DerivedTypeDetails *dtDetails{nullptr}; -- cgit v1.1 From dc5da4851de5d29dd040d85a8387e2e5b4b12b7b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Feb 2024 16:41:02 +0100 Subject: [InstCombine] Add tests for #77108 (NFC) --- llvm/test/Transforms/InstCombine/dependent-ivs.ll | 374 ++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/dependent-ivs.ll diff --git a/llvm/test/Transforms/InstCombine/dependent-ivs.ll b/llvm/test/Transforms/InstCombine/dependent-ivs.ll new file mode 100644 index 0000000..bd66791 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/dependent-ivs.ll @@ -0,0 +1,374 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define void @int_iv_nuw(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_nuw( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add nuw i64 %iv.next, %base + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_nsw(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_nsw( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], 
[[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add nsw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add nsw i64 %iv.next, %base + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_commuted(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_commuted( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE2:%.*]] = mul i64 [[BASE]], 42 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE2]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[BASE2]], [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %base2 = mul i64 %base, 42 ; thwart complexity-based canonicalization + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base2, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add i64 %base2, %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_vector(<2 x i64> %base) { +; CHECK-LABEL: define void @int_iv_vector( +; CHECK-SAME: <2 x i64> [[BASE:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi <2 x i64> [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: call void @use.v2i64(<2 x i64> [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], +; CHECK-NEXT: [[IV2_NEXT]] = add <2 x i64> [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi <2 x i64> [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi <2 x i64> [ %iv.next, %loop ], [ zeroinitializer, %entry ] + call void @use.v2i64(<2 x i64> %iv2) + %iv.next = add nuw nsw <2 x i64> %iv, + %iv2.next = add <2 x i64> %iv.next, %base + %cmp = call i1 @get.i1() + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @int_iv_loop_variant_step(i64 %base, i64 %end) { +; CHECK-LABEL: define void @int_iv_loop_variant_step( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: 
[[STEP:%.*]] = call i64 @get.i64() +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[STEP]] +; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.i64(i64 %iv2) + %step = call i64 @get.i64() + %iv.next = add nuw nsw i64 %iv, %step + %iv2.next = add nuw i64 %iv.next, %base + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_inbounds(ptr %base, i64 %end) { +; CHECK-LABEL: define void @ptr_iv_inbounds( +; CHECK-SAME: ptr [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi ptr [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.p0(ptr %iv.ptr) + %iv.next = add nuw nsw i64 %iv, 4 + %iv.ptr.next = getelementptr inbounds i8, ptr %base, i64 %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_no_inbounds(ptr %base, i64 %end) { +; CHECK-LABEL: define void @ptr_iv_no_inbounds( +; CHECK-SAME: ptr [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, ptr [[BASE]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi ptr [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.p0(ptr %iv.ptr) + %iv.next = add nuw nsw i64 %iv, 4 + %iv.ptr.next = getelementptr i8, ptr %base, i64 %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_vector(<2 x ptr> %base, i64 %end) { +; CHECK-LABEL: define void @ptr_iv_vector( +; CHECK-SAME: <2 x ptr> [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw 
i64 [[IV]], 4 +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, <2 x ptr> [[BASE]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi <2 x ptr> [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + call void @use.v2p0(<2 x ptr> %iv.ptr) + %iv.next = add nuw nsw i64 %iv, 4 + %iv.ptr.next = getelementptr inbounds i8, <2 x ptr> %base, i64 %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @ptr_iv_vector2(<2 x ptr> %base) { +; CHECK-LABEL: define void @ptr_iv_vector2( +; CHECK-SAME: <2 x ptr> [[BASE:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], +; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, <2 x ptr> [[BASE]], <2 x i64> [[IV_NEXT]] +; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.ptr = phi <2 x ptr> [ %iv.ptr.next, %loop ], [ %base, %entry ] + %iv = phi <2 x i64> [ %iv.next, %loop ], [ zeroinitializer, %entry ] + call void @use.v2p0(<2 x ptr> %iv.ptr) + %iv.next = add nuw nsw <2 x i64> %iv, + %iv.ptr.next = getelementptr i8, <2 x ptr> %base, <2 x i64> %iv.next + %cmp = call i1 @get.i1() + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @wrong_start_value(i64 %base, i64 %end) { +; CHECK-LABEL: define void @wrong_start_value( +; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 1, [[ENTRY]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ] + %iv = phi i64 [ %iv.next, %loop ], [ 1, %entry ] + call void @use.i64(i64 %iv2) + %iv.next = add nuw nsw i64 %iv, 4 + %iv2.next = add i64 %base, %iv.next + %cmp = icmp eq i64 %iv.next, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @different_loops(i64 %base) { +; CHECK-LABEL: define void @different_loops( +; CHECK-SAME: i64 [[BASE:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop1: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP2:%.*]], label [[LOOP1]] +; CHECK: loop2: +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ 
[[IV2_NEXT:%.*]], [[LOOP2]] ], [ [[BASE]], [[LOOP1]] ] +; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) +; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[CMP2:%.*]] = call i1 @get.i1() +; CHECK-NEXT: br i1 [[CMP2]], label [[EXIT:%.*]], label [[LOOP2]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop1 + +loop1: + %iv = phi i64 [ %iv.next, %loop1 ], [ 0, %entry ] + call void @use.i64(i64 %iv) + %iv.next = add nuw nsw i64 %iv, 4 + %cmp = call i1 @get.i1() + br i1 %cmp, label %loop2, label %loop1 + +loop2: + %iv2 = phi i64 [ %iv2.next, %loop2 ], [ %base, %loop1 ] + call void @use.i64(i64 %iv2) + %iv2.next = add nuw i64 %base, %iv.next + %cmp2 = call i1 @get.i1() + br i1 %cmp2, label %exit, label %loop2 + +exit: + ret void +} + +declare void @use.p0(ptr) +declare void @use.v2p0(<2 x ptr>) +declare void @use.i64(i64) +declare void @use.v2i64(<2 x i64>) +declare i1 @get.i1() +declare i64 @get.i64() -- cgit v1.1 From fffcc5ca83ad2700a3586c1b849a36c6081e2023 Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Thu, 8 Feb 2024 16:54:12 +0100 Subject: [CodeGen] Add ValueType v3i8 (NFCI). (#80826) --- llvm/include/llvm/CodeGen/ValueTypes.td | 363 +++++++++++++------------- llvm/lib/CodeGen/ValueTypes.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 + 3 files changed, 187 insertions(+), 181 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 55baaf8..1054738 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -97,192 +97,193 @@ def v128i4 : VTVec<128, i4, 32>; // 128 x i4 vector value def v1i8 : VTVec<1, i8, 33>; // 1 x i8 vector value def v2i8 : VTVec<2, i8, 34>; // 2 x i8 vector value -def v4i8 : VTVec<4, i8, 35>; // 4 x i8 vector value -def v8i8 : VTVec<8, i8, 36>; // 8 x i8 vector value -def v16i8 : VTVec<16, i8, 37>; // 16 x i8 vector value -def v32i8 : VTVec<32, i8, 38>; // 32 x i8 vector value -def v64i8 : VTVec<64, i8, 39>; // 64 x i8 vector value -def v128i8 : VTVec<128, i8, 40>; // 128 x i8 vector value -def v256i8 : VTVec<256, i8, 41>; // 256 x i8 vector value -def v512i8 : VTVec<512, i8, 42>; // 512 x i8 vector value -def v1024i8 : VTVec<1024, i8, 43>; // 1024 x i8 vector value - -def v1i16 : VTVec<1, i16, 44>; // 1 x i16 vector value -def v2i16 : VTVec<2, i16, 45>; // 2 x i16 vector value -def v3i16 : VTVec<3, i16, 46>; // 3 x i16 vector value -def v4i16 : VTVec<4, i16, 47>; // 4 x i16 vector value -def v8i16 : VTVec<8, i16, 48>; // 8 x i16 vector value -def v16i16 : VTVec<16, i16, 49>; // 16 x i16 vector value -def v32i16 : VTVec<32, i16, 50>; // 32 x i16 vector value -def v64i16 : VTVec<64, i16, 51>; // 64 x i16 vector value -def v128i16 : VTVec<128, i16, 52>; // 128 x i16 vector value -def v256i16 : VTVec<256, i16, 53>; // 256 x i16 vector value -def v512i16 : VTVec<512, i16, 54>; // 512 x i16 vector value - -def v1i32 : VTVec<1, i32, 55>; // 1 x i32 vector value -def v2i32 : VTVec<2, i32, 56>; // 2 x i32 vector value -def v3i32 : VTVec<3, i32, 57>; // 3 x i32 vector value -def v4i32 : VTVec<4, i32, 58>; // 4 x i32 vector value -def v5i32 : VTVec<5, i32, 59>; // 5 x i32 vector value -def v6i32 : VTVec<6, i32, 60>; // 6 x f32 vector value -def v7i32 : VTVec<7, i32, 61>; // 7 x f32 vector value -def v8i32 : VTVec<8, i32, 62>; // 8 x i32 vector value -def v9i32 : VTVec<9, i32, 63>; // 9 x i32 vector value -def v10i32 : VTVec<10, i32, 64>; // 10 x i32 vector value -def v11i32 : VTVec<11, i32, 65>; // 11 x i32 vector 
value -def v12i32 : VTVec<12, i32, 66>; // 12 x i32 vector value -def v16i32 : VTVec<16, i32, 67>; // 16 x i32 vector value -def v32i32 : VTVec<32, i32, 68>; // 32 x i32 vector value -def v64i32 : VTVec<64, i32, 69>; // 64 x i32 vector value -def v128i32 : VTVec<128, i32, 70>; // 128 x i32 vector value -def v256i32 : VTVec<256, i32, 71>; // 256 x i32 vector value -def v512i32 : VTVec<512, i32, 72>; // 512 x i32 vector value -def v1024i32 : VTVec<1024, i32, 73>; // 1024 x i32 vector value -def v2048i32 : VTVec<2048, i32, 74>; // 2048 x i32 vector value - -def v1i64 : VTVec<1, i64, 75>; // 1 x i64 vector value -def v2i64 : VTVec<2, i64, 76>; // 2 x i64 vector value -def v3i64 : VTVec<3, i64, 77>; // 3 x i64 vector value -def v4i64 : VTVec<4, i64, 78>; // 4 x i64 vector value -def v8i64 : VTVec<8, i64, 79>; // 8 x i64 vector value -def v16i64 : VTVec<16, i64, 80>; // 16 x i64 vector value -def v32i64 : VTVec<32, i64, 81>; // 32 x i64 vector value -def v64i64 : VTVec<64, i64, 82>; // 64 x i64 vector value -def v128i64 : VTVec<128, i64, 83>; // 128 x i64 vector value -def v256i64 : VTVec<256, i64, 84>; // 256 x i64 vector value - -def v1i128 : VTVec<1, i128, 85>; // 1 x i128 vector value - -def v1f16 : VTVec<1, f16, 86>; // 1 x f16 vector value -def v2f16 : VTVec<2, f16, 87>; // 2 x f16 vector value -def v3f16 : VTVec<3, f16, 88>; // 3 x f16 vector value -def v4f16 : VTVec<4, f16, 89>; // 4 x f16 vector value -def v8f16 : VTVec<8, f16, 90>; // 8 x f16 vector value -def v16f16 : VTVec<16, f16, 91>; // 16 x f16 vector value -def v32f16 : VTVec<32, f16, 92>; // 32 x f16 vector value -def v64f16 : VTVec<64, f16, 93>; // 64 x f16 vector value -def v128f16 : VTVec<128, f16, 94>; // 128 x f16 vector value -def v256f16 : VTVec<256, f16, 95>; // 256 x f16 vector value -def v512f16 : VTVec<512, f16, 96>; // 512 x f16 vector value - -def v2bf16 : VTVec<2, bf16, 97>; // 2 x bf16 vector value -def v3bf16 : VTVec<3, bf16, 98>; // 3 x bf16 vector value -def v4bf16 : VTVec<4, bf16, 99>; // 4 x bf16 vector value -def v8bf16 : VTVec<8, bf16, 100>; // 8 x bf16 vector value -def v16bf16 : VTVec<16, bf16, 101>; // 16 x bf16 vector value -def v32bf16 : VTVec<32, bf16, 102>; // 32 x bf16 vector value -def v64bf16 : VTVec<64, bf16, 103>; // 64 x bf16 vector value -def v128bf16 : VTVec<128, bf16, 104>; // 128 x bf16 vector value - -def v1f32 : VTVec<1, f32, 105>; // 1 x f32 vector value -def v2f32 : VTVec<2, f32, 106>; // 2 x f32 vector value -def v3f32 : VTVec<3, f32, 107>; // 3 x f32 vector value -def v4f32 : VTVec<4, f32, 108>; // 4 x f32 vector value -def v5f32 : VTVec<5, f32, 109>; // 5 x f32 vector value -def v6f32 : VTVec<6, f32, 110>; // 6 x f32 vector value -def v7f32 : VTVec<7, f32, 111>; // 7 x f32 vector value -def v8f32 : VTVec<8, f32, 112>; // 8 x f32 vector value -def v9f32 : VTVec<9, f32, 113>; // 9 x f32 vector value -def v10f32 : VTVec<10, f32, 114>; // 10 x f32 vector value -def v11f32 : VTVec<11, f32, 115>; // 11 x f32 vector value -def v12f32 : VTVec<12, f32, 116>; // 12 x f32 vector value -def v16f32 : VTVec<16, f32, 117>; // 16 x f32 vector value -def v32f32 : VTVec<32, f32, 118>; // 32 x f32 vector value -def v64f32 : VTVec<64, f32, 119>; // 64 x f32 vector value -def v128f32 : VTVec<128, f32, 120>; // 128 x f32 vector value -def v256f32 : VTVec<256, f32, 121>; // 256 x f32 vector value -def v512f32 : VTVec<512, f32, 122>; // 512 x f32 vector value -def v1024f32 : VTVec<1024, f32, 123>; // 1024 x f32 vector value -def v2048f32 : VTVec<2048, f32, 124>; // 2048 x f32 vector value - -def v1f64 : 
VTVec<1, f64, 125>; // 1 x f64 vector value -def v2f64 : VTVec<2, f64, 126>; // 2 x f64 vector value -def v3f64 : VTVec<3, f64, 127>; // 3 x f64 vector value -def v4f64 : VTVec<4, f64, 128>; // 4 x f64 vector value -def v8f64 : VTVec<8, f64, 129>; // 8 x f64 vector value -def v16f64 : VTVec<16, f64, 130>; // 16 x f64 vector value -def v32f64 : VTVec<32, f64, 131>; // 32 x f64 vector value -def v64f64 : VTVec<64, f64, 132>; // 64 x f64 vector value -def v128f64 : VTVec<128, f64, 133>; // 128 x f64 vector value -def v256f64 : VTVec<256, f64, 134>; // 256 x f64 vector value - -def nxv1i1 : VTScalableVec<1, i1, 135>; // n x 1 x i1 vector value -def nxv2i1 : VTScalableVec<2, i1, 136>; // n x 2 x i1 vector value -def nxv4i1 : VTScalableVec<4, i1, 137>; // n x 4 x i1 vector value -def nxv8i1 : VTScalableVec<8, i1, 138>; // n x 8 x i1 vector value -def nxv16i1 : VTScalableVec<16, i1, 139>; // n x 16 x i1 vector value -def nxv32i1 : VTScalableVec<32, i1, 140>; // n x 32 x i1 vector value -def nxv64i1 : VTScalableVec<64, i1, 141>; // n x 64 x i1 vector value - -def nxv1i8 : VTScalableVec<1, i8, 142>; // n x 1 x i8 vector value -def nxv2i8 : VTScalableVec<2, i8, 143>; // n x 2 x i8 vector value -def nxv4i8 : VTScalableVec<4, i8, 144>; // n x 4 x i8 vector value -def nxv8i8 : VTScalableVec<8, i8, 145>; // n x 8 x i8 vector value -def nxv16i8 : VTScalableVec<16, i8, 146>; // n x 16 x i8 vector value -def nxv32i8 : VTScalableVec<32, i8, 147>; // n x 32 x i8 vector value -def nxv64i8 : VTScalableVec<64, i8, 148>; // n x 64 x i8 vector value - -def nxv1i16 : VTScalableVec<1, i16, 149>; // n x 1 x i16 vector value -def nxv2i16 : VTScalableVec<2, i16, 150>; // n x 2 x i16 vector value -def nxv4i16 : VTScalableVec<4, i16, 151>; // n x 4 x i16 vector value -def nxv8i16 : VTScalableVec<8, i16, 152>; // n x 8 x i16 vector value -def nxv16i16 : VTScalableVec<16, i16, 153>; // n x 16 x i16 vector value -def nxv32i16 : VTScalableVec<32, i16, 154>; // n x 32 x i16 vector value - -def nxv1i32 : VTScalableVec<1, i32, 155>; // n x 1 x i32 vector value -def nxv2i32 : VTScalableVec<2, i32, 156>; // n x 2 x i32 vector value -def nxv4i32 : VTScalableVec<4, i32, 157>; // n x 4 x i32 vector value -def nxv8i32 : VTScalableVec<8, i32, 158>; // n x 8 x i32 vector value -def nxv16i32 : VTScalableVec<16, i32, 159>; // n x 16 x i32 vector value -def nxv32i32 : VTScalableVec<32, i32, 160>; // n x 32 x i32 vector value - -def nxv1i64 : VTScalableVec<1, i64, 161>; // n x 1 x i64 vector value -def nxv2i64 : VTScalableVec<2, i64, 162>; // n x 2 x i64 vector value -def nxv4i64 : VTScalableVec<4, i64, 163>; // n x 4 x i64 vector value -def nxv8i64 : VTScalableVec<8, i64, 164>; // n x 8 x i64 vector value -def nxv16i64 : VTScalableVec<16, i64, 165>; // n x 16 x i64 vector value -def nxv32i64 : VTScalableVec<32, i64, 166>; // n x 32 x i64 vector value - -def nxv1f16 : VTScalableVec<1, f16, 167>; // n x 1 x f16 vector value -def nxv2f16 : VTScalableVec<2, f16, 168>; // n x 2 x f16 vector value -def nxv4f16 : VTScalableVec<4, f16, 169>; // n x 4 x f16 vector value -def nxv8f16 : VTScalableVec<8, f16, 170>; // n x 8 x f16 vector value -def nxv16f16 : VTScalableVec<16, f16, 171>; // n x 16 x f16 vector value -def nxv32f16 : VTScalableVec<32, f16, 172>; // n x 32 x f16 vector value - -def nxv1bf16 : VTScalableVec<1, bf16, 173>; // n x 1 x bf16 vector value -def nxv2bf16 : VTScalableVec<2, bf16, 174>; // n x 2 x bf16 vector value -def nxv4bf16 : VTScalableVec<4, bf16, 175>; // n x 4 x bf16 vector value -def nxv8bf16 : VTScalableVec<8, bf16, 
176>; // n x 8 x bf16 vector value -def nxv16bf16 : VTScalableVec<16, bf16, 177>; // n x 16 x bf16 vector value -def nxv32bf16 : VTScalableVec<32, bf16, 178>; // n x 32 x bf16 vector value - -def nxv1f32 : VTScalableVec<1, f32, 179>; // n x 1 x f32 vector value -def nxv2f32 : VTScalableVec<2, f32, 180>; // n x 2 x f32 vector value -def nxv4f32 : VTScalableVec<4, f32, 181>; // n x 4 x f32 vector value -def nxv8f32 : VTScalableVec<8, f32, 182>; // n x 8 x f32 vector value -def nxv16f32 : VTScalableVec<16, f32, 183>; // n x 16 x f32 vector value - -def nxv1f64 : VTScalableVec<1, f64, 184>; // n x 1 x f64 vector value -def nxv2f64 : VTScalableVec<2, f64, 185>; // n x 2 x f64 vector value -def nxv4f64 : VTScalableVec<4, f64, 186>; // n x 4 x f64 vector value -def nxv8f64 : VTScalableVec<8, f64, 187>; // n x 8 x f64 vector value - -def x86mmx : ValueType<64, 188>; // X86 MMX value -def FlagVT : ValueType<0, 189> { // Pre-RA sched glue +def v3i8 : VTVec<3, i8, 35>; // 3 x i8 vector value +def v4i8 : VTVec<4, i8, 36>; // 4 x i8 vector value +def v8i8 : VTVec<8, i8, 37>; // 8 x i8 vector value +def v16i8 : VTVec<16, i8, 38>; // 16 x i8 vector value +def v32i8 : VTVec<32, i8, 39>; // 32 x i8 vector value +def v64i8 : VTVec<64, i8, 40>; // 64 x i8 vector value +def v128i8 : VTVec<128, i8, 41>; // 128 x i8 vector value +def v256i8 : VTVec<256, i8, 42>; // 256 x i8 vector value +def v512i8 : VTVec<512, i8, 43>; // 512 x i8 vector value +def v1024i8 : VTVec<1024, i8, 44>; // 1024 x i8 vector value + +def v1i16 : VTVec<1, i16, 45>; // 1 x i16 vector value +def v2i16 : VTVec<2, i16, 46>; // 2 x i16 vector value +def v3i16 : VTVec<3, i16, 47>; // 3 x i16 vector value +def v4i16 : VTVec<4, i16, 48>; // 4 x i16 vector value +def v8i16 : VTVec<8, i16, 49>; // 8 x i16 vector value +def v16i16 : VTVec<16, i16, 50>; // 16 x i16 vector value +def v32i16 : VTVec<32, i16, 51>; // 32 x i16 vector value +def v64i16 : VTVec<64, i16, 52>; // 64 x i16 vector value +def v128i16 : VTVec<128, i16, 53>; // 128 x i16 vector value +def v256i16 : VTVec<256, i16, 54>; // 256 x i16 vector value +def v512i16 : VTVec<512, i16, 55>; // 512 x i16 vector value + +def v1i32 : VTVec<1, i32, 56>; // 1 x i32 vector value +def v2i32 : VTVec<2, i32, 57>; // 2 x i32 vector value +def v3i32 : VTVec<3, i32, 58>; // 3 x i32 vector value +def v4i32 : VTVec<4, i32, 59>; // 4 x i32 vector value +def v5i32 : VTVec<5, i32, 60>; // 5 x i32 vector value +def v6i32 : VTVec<6, i32, 61>; // 6 x f32 vector value +def v7i32 : VTVec<7, i32, 62>; // 7 x f32 vector value +def v8i32 : VTVec<8, i32, 63>; // 8 x i32 vector value +def v9i32 : VTVec<9, i32, 64>; // 9 x i32 vector value +def v10i32 : VTVec<10, i32, 65>; // 10 x i32 vector value +def v11i32 : VTVec<11, i32, 66>; // 11 x i32 vector value +def v12i32 : VTVec<12, i32, 67>; // 12 x i32 vector value +def v16i32 : VTVec<16, i32, 68>; // 16 x i32 vector value +def v32i32 : VTVec<32, i32, 69>; // 32 x i32 vector value +def v64i32 : VTVec<64, i32, 70>; // 64 x i32 vector value +def v128i32 : VTVec<128, i32, 71>; // 128 x i32 vector value +def v256i32 : VTVec<256, i32, 72>; // 256 x i32 vector value +def v512i32 : VTVec<512, i32, 73>; // 512 x i32 vector value +def v1024i32 : VTVec<1024, i32, 74>; // 1024 x i32 vector value +def v2048i32 : VTVec<2048, i32, 75>; // 2048 x i32 vector value + +def v1i64 : VTVec<1, i64, 76>; // 1 x i64 vector value +def v2i64 : VTVec<2, i64, 77>; // 2 x i64 vector value +def v3i64 : VTVec<3, i64, 78>; // 3 x i64 vector value +def v4i64 : VTVec<4, i64, 79>; // 4 x i64 vector 
value +def v8i64 : VTVec<8, i64, 80>; // 8 x i64 vector value +def v16i64 : VTVec<16, i64, 81>; // 16 x i64 vector value +def v32i64 : VTVec<32, i64, 82>; // 32 x i64 vector value +def v64i64 : VTVec<64, i64, 83>; // 64 x i64 vector value +def v128i64 : VTVec<128, i64, 84>; // 128 x i64 vector value +def v256i64 : VTVec<256, i64, 85>; // 256 x i64 vector value + +def v1i128 : VTVec<1, i128, 86>; // 1 x i128 vector value + +def v1f16 : VTVec<1, f16, 87>; // 1 x f16 vector value +def v2f16 : VTVec<2, f16, 88>; // 2 x f16 vector value +def v3f16 : VTVec<3, f16, 89>; // 3 x f16 vector value +def v4f16 : VTVec<4, f16, 90>; // 4 x f16 vector value +def v8f16 : VTVec<8, f16, 91>; // 8 x f16 vector value +def v16f16 : VTVec<16, f16, 92>; // 16 x f16 vector value +def v32f16 : VTVec<32, f16, 93>; // 32 x f16 vector value +def v64f16 : VTVec<64, f16, 94>; // 64 x f16 vector value +def v128f16 : VTVec<128, f16, 95>; // 128 x f16 vector value +def v256f16 : VTVec<256, f16, 96>; // 256 x f16 vector value +def v512f16 : VTVec<512, f16, 97>; // 512 x f16 vector value + +def v2bf16 : VTVec<2, bf16, 98>; // 2 x bf16 vector value +def v3bf16 : VTVec<3, bf16, 99>; // 3 x bf16 vector value +def v4bf16 : VTVec<4, bf16, 100>; // 4 x bf16 vector value +def v8bf16 : VTVec<8, bf16, 101>; // 8 x bf16 vector value +def v16bf16 : VTVec<16, bf16, 102>; // 16 x bf16 vector value +def v32bf16 : VTVec<32, bf16, 103>; // 32 x bf16 vector value +def v64bf16 : VTVec<64, bf16, 104>; // 64 x bf16 vector value +def v128bf16 : VTVec<128, bf16, 105>; // 128 x bf16 vector value + +def v1f32 : VTVec<1, f32, 106>; // 1 x f32 vector value +def v2f32 : VTVec<2, f32, 107>; // 2 x f32 vector value +def v3f32 : VTVec<3, f32, 108>; // 3 x f32 vector value +def v4f32 : VTVec<4, f32, 109>; // 4 x f32 vector value +def v5f32 : VTVec<5, f32, 110>; // 5 x f32 vector value +def v6f32 : VTVec<6, f32, 111>; // 6 x f32 vector value +def v7f32 : VTVec<7, f32, 112>; // 7 x f32 vector value +def v8f32 : VTVec<8, f32, 113>; // 8 x f32 vector value +def v9f32 : VTVec<9, f32, 114>; // 9 x f32 vector value +def v10f32 : VTVec<10, f32, 115>; // 10 x f32 vector value +def v11f32 : VTVec<11, f32, 116>; // 11 x f32 vector value +def v12f32 : VTVec<12, f32, 117>; // 12 x f32 vector value +def v16f32 : VTVec<16, f32, 118>; // 16 x f32 vector value +def v32f32 : VTVec<32, f32, 119>; // 32 x f32 vector value +def v64f32 : VTVec<64, f32, 120>; // 64 x f32 vector value +def v128f32 : VTVec<128, f32, 121>; // 128 x f32 vector value +def v256f32 : VTVec<256, f32, 122>; // 256 x f32 vector value +def v512f32 : VTVec<512, f32, 123>; // 512 x f32 vector value +def v1024f32 : VTVec<1024, f32, 124>; // 1024 x f32 vector value +def v2048f32 : VTVec<2048, f32, 125>; // 2048 x f32 vector value + +def v1f64 : VTVec<1, f64, 126>; // 1 x f64 vector value +def v2f64 : VTVec<2, f64, 127>; // 2 x f64 vector value +def v3f64 : VTVec<3, f64, 128>; // 3 x f64 vector value +def v4f64 : VTVec<4, f64, 129>; // 4 x f64 vector value +def v8f64 : VTVec<8, f64, 130>; // 8 x f64 vector value +def v16f64 : VTVec<16, f64, 131>; // 16 x f64 vector value +def v32f64 : VTVec<32, f64, 132>; // 32 x f64 vector value +def v64f64 : VTVec<64, f64, 133>; // 64 x f64 vector value +def v128f64 : VTVec<128, f64, 134>; // 128 x f64 vector value +def v256f64 : VTVec<256, f64, 135>; // 256 x f64 vector value + +def nxv1i1 : VTScalableVec<1, i1, 136>; // n x 1 x i1 vector value +def nxv2i1 : VTScalableVec<2, i1, 137>; // n x 2 x i1 vector value +def nxv4i1 : VTScalableVec<4, i1, 138>; // n x 4 x i1 vector 
value +def nxv8i1 : VTScalableVec<8, i1, 139>; // n x 8 x i1 vector value +def nxv16i1 : VTScalableVec<16, i1, 140>; // n x 16 x i1 vector value +def nxv32i1 : VTScalableVec<32, i1, 141>; // n x 32 x i1 vector value +def nxv64i1 : VTScalableVec<64, i1, 142>; // n x 64 x i1 vector value + +def nxv1i8 : VTScalableVec<1, i8, 143>; // n x 1 x i8 vector value +def nxv2i8 : VTScalableVec<2, i8, 144>; // n x 2 x i8 vector value +def nxv4i8 : VTScalableVec<4, i8, 145>; // n x 4 x i8 vector value +def nxv8i8 : VTScalableVec<8, i8, 146>; // n x 8 x i8 vector value +def nxv16i8 : VTScalableVec<16, i8, 147>; // n x 16 x i8 vector value +def nxv32i8 : VTScalableVec<32, i8, 148>; // n x 32 x i8 vector value +def nxv64i8 : VTScalableVec<64, i8, 149>; // n x 64 x i8 vector value + +def nxv1i16 : VTScalableVec<1, i16, 150>; // n x 1 x i16 vector value +def nxv2i16 : VTScalableVec<2, i16, 151>; // n x 2 x i16 vector value +def nxv4i16 : VTScalableVec<4, i16, 152>; // n x 4 x i16 vector value +def nxv8i16 : VTScalableVec<8, i16, 153>; // n x 8 x i16 vector value +def nxv16i16 : VTScalableVec<16, i16, 154>; // n x 16 x i16 vector value +def nxv32i16 : VTScalableVec<32, i16, 155>; // n x 32 x i16 vector value + +def nxv1i32 : VTScalableVec<1, i32, 156>; // n x 1 x i32 vector value +def nxv2i32 : VTScalableVec<2, i32, 157>; // n x 2 x i32 vector value +def nxv4i32 : VTScalableVec<4, i32, 158>; // n x 4 x i32 vector value +def nxv8i32 : VTScalableVec<8, i32, 159>; // n x 8 x i32 vector value +def nxv16i32 : VTScalableVec<16, i32, 160>; // n x 16 x i32 vector value +def nxv32i32 : VTScalableVec<32, i32, 161>; // n x 32 x i32 vector value + +def nxv1i64 : VTScalableVec<1, i64, 162>; // n x 1 x i64 vector value +def nxv2i64 : VTScalableVec<2, i64, 163>; // n x 2 x i64 vector value +def nxv4i64 : VTScalableVec<4, i64, 164>; // n x 4 x i64 vector value +def nxv8i64 : VTScalableVec<8, i64, 165>; // n x 8 x i64 vector value +def nxv16i64 : VTScalableVec<16, i64, 166>; // n x 16 x i64 vector value +def nxv32i64 : VTScalableVec<32, i64, 167>; // n x 32 x i64 vector value + +def nxv1f16 : VTScalableVec<1, f16, 168>; // n x 1 x f16 vector value +def nxv2f16 : VTScalableVec<2, f16, 169>; // n x 2 x f16 vector value +def nxv4f16 : VTScalableVec<4, f16, 170>; // n x 4 x f16 vector value +def nxv8f16 : VTScalableVec<8, f16, 171>; // n x 8 x f16 vector value +def nxv16f16 : VTScalableVec<16, f16, 172>; // n x 16 x f16 vector value +def nxv32f16 : VTScalableVec<32, f16, 173>; // n x 32 x f16 vector value + +def nxv1bf16 : VTScalableVec<1, bf16, 174>; // n x 1 x bf16 vector value +def nxv2bf16 : VTScalableVec<2, bf16, 175>; // n x 2 x bf16 vector value +def nxv4bf16 : VTScalableVec<4, bf16, 176>; // n x 4 x bf16 vector value +def nxv8bf16 : VTScalableVec<8, bf16, 177>; // n x 8 x bf16 vector value +def nxv16bf16 : VTScalableVec<16, bf16, 178>; // n x 16 x bf16 vector value +def nxv32bf16 : VTScalableVec<32, bf16, 179>; // n x 32 x bf16 vector value + +def nxv1f32 : VTScalableVec<1, f32, 180>; // n x 1 x f32 vector value +def nxv2f32 : VTScalableVec<2, f32, 181>; // n x 2 x f32 vector value +def nxv4f32 : VTScalableVec<4, f32, 182>; // n x 4 x f32 vector value +def nxv8f32 : VTScalableVec<8, f32, 183>; // n x 8 x f32 vector value +def nxv16f32 : VTScalableVec<16, f32, 184>; // n x 16 x f32 vector value + +def nxv1f64 : VTScalableVec<1, f64, 185>; // n x 1 x f64 vector value +def nxv2f64 : VTScalableVec<2, f64, 186>; // n x 2 x f64 vector value +def nxv4f64 : VTScalableVec<4, f64, 187>; // n x 4 x f64 vector value +def nxv8f64 : 
VTScalableVec<8, f64, 188>; // n x 8 x f64 vector value + +def x86mmx : ValueType<64, 189>; // X86 MMX value +def FlagVT : ValueType<0, 190> { // Pre-RA sched glue let LLVMName = "Glue"; } -def isVoid : ValueType<0, 190>; // Produces no value -def untyped : ValueType<8, 191> { // Produces an untyped value +def isVoid : ValueType<0, 191>; // Produces no value +def untyped : ValueType<8, 192> { // Produces an untyped value let LLVMName = "Untyped"; } -def funcref : ValueType<0, 192>; // WebAssembly's funcref type -def externref : ValueType<0, 193>; // WebAssembly's externref type -def x86amx : ValueType<8192, 194>; // X86 AMX value -def i64x8 : ValueType<512, 195>; // 8 Consecutive GPRs (AArch64) +def funcref : ValueType<0, 193>; // WebAssembly's funcref type +def externref : ValueType<0, 194>; // WebAssembly's externref type +def x86amx : ValueType<8192, 195>; // X86 AMX value +def i64x8 : ValueType<512, 196>; // 8 Consecutive GPRs (AArch64) def aarch64svcount - : ValueType<16, 196>; // AArch64 predicate-as-counter -def spirvbuiltin : ValueType<0, 197>; // SPIR-V's builtin type + : ValueType<16, 197>; // AArch64 predicate-as-counter +def spirvbuiltin : ValueType<0, 198>; // SPIR-V's builtin type def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249> { // Metadata diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index ba3b9e0..731fcab 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -264,6 +264,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { return FixedVectorType::get(Type::getInt8Ty(Context), 1); case MVT::v2i8: return FixedVectorType::get(Type::getInt8Ty(Context), 2); + case MVT::v3i8: + return FixedVectorType::get(Type::getInt8Ty(Context), 3); case MVT::v4i8: return FixedVectorType::get(Type::getInt8Ty(Context), 4); case MVT::v8i8: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 10569d9..528257e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -308,8 +308,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand); setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand); setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand); -- cgit v1.1 From b14731fe93d0db9a59984783051880795ae0992d Mon Sep 17 00:00:00 2001 From: erichkeane Date: Thu, 8 Feb 2024 07:57:57 -0800 Subject: [OpenACC][NFC] Fix parse result from 'set' Apparently 'set' was being parsed as 'shutdown'. There isn't really any way of detecting this without getting into a Sema implementation, but I am fixing it now as I noticed it; a sketch of the parsing pattern involved follows.
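For context, a minimal, self-contained sketch of the `llvm::StringSwitch` pattern involved (the enum and function here are invented for illustration and are not the actual Clang code):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

enum class DirKind { Init, Shutdown, Set, Invalid };

// Every .Case body compiles no matter which enumerator it names, so a
// pasted line that maps "set" to Shutdown is only caught by a semantic
// test, matching the commit's observation about needing Sema support.
static DirKind classifyDirective(llvm::StringRef S) {
  return llvm::StringSwitch<DirKind>(S)
      .Case("init", DirKind::Init)
      .Case("shutdown", DirKind::Shutdown)
      .Case("set", DirKind::Set) // before the fix this line returned Shutdown
      .Default(DirKind::Invalid);
}
```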
--- clang/lib/Parse/ParseOpenACC.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 1fee9f8..e099d07 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -54,7 +54,7 @@ OpenACCDirectiveKindEx getOpenACCDirectiveKind(Token Tok) { .Case("declare", OpenACCDirectiveKind::Declare) .Case("init", OpenACCDirectiveKind::Init) .Case("shutdown", OpenACCDirectiveKind::Shutdown) - .Case("set", OpenACCDirectiveKind::Shutdown) + .Case("set", OpenACCDirectiveKind::Set) .Case("update", OpenACCDirectiveKind::Update) .Case("wait", OpenACCDirectiveKind::Wait) .Default(OpenACCDirectiveKind::Invalid); -- cgit v1.1 From 067d2779fcfc62dd429177f350b8cefe49b65b51 Mon Sep 17 00:00:00 2001 From: ian Bearman Date: Thu, 8 Feb 2024 07:59:37 -0800 Subject: [MLIR] Setting MemorySpace During Bufferization (#78484) Collection of changes with the goal of being able to convert `encoding` to `memorySpace` during bufferization:

- new API for the encoder, allowing the implementation to select the destination memory space
- update existing bufferization implementations to support the new interface

--- .../Dialect/Bufferization/IR/BufferizableOpInterface.h | 15 ++++++++++----- .../Arith/Transforms/BufferizableOpInterfaceImpl.cpp | 13 +++++++------ .../Bufferization/IR/BufferizableOpInterface.cpp | 14 ++++++++------ mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp | 4 ++-- mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp | 8 ++++++-- .../Transforms/FuncBufferizableOpInterfaceImpl.cpp | 5 +++-- .../Tensor/Transforms/BufferizableOpInterfaceImpl.cpp | 17 ++++++++++------- .../Dialect/Bufferization/TestTensorCopyInsertion.cpp | 6 ++++-- 8 files changed, 50 insertions(+), 32 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index 226a2fb..d8cfeee 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -257,6 +257,9 @@ struct BufferizationOptions { /// Parameters: Value, memory space, bufferization options using UnknownTypeConverterFn = std::function; + // Produce a MemorySpace attribute from a tensor type + using DefaultMemorySpaceFn = + std::function(TensorType t)>; BufferizationOptions(); @@ -296,11 +299,6 @@ struct BufferizationOptions { /// bufferized or not. bool bufferizeFunctionBoundaries = false; - /// The default memory space that should be used when it cannot be inferred - /// from the context. If case of std::nullopt, bufferization fails when the - /// memory space cannot be inferred at any point. - std::optional defaultMemorySpace = Attribute(); - /// Certain ops have aliasing OpOperand/OpResult invariants (e.g., scf.for). /// If this flag is set to `false`, those invariants are no longer enforced /// with buffer copies. @@ -351,6 +349,13 @@ struct BufferizationOptions { /// used. UnknownTypeConverterFn unknownTypeConverterFn = nullptr; + // Use during type conversion to determine the memory space for memref based + // on the original tensor type if the memory space cannot be inferred. + // Returning std::nullopt will cause bufferization to fail (useful to indicate + // failure to determine memory space for a tensor type). + DefaultMemorySpaceFn defaultMemorySpaceFn = + [](TensorType t) -> std::optional { return Attribute(); }; + /// Seed for the analysis fuzzer.
If set to `0`, the fuzzer is deactivated. /// Should be used only with `testAnalysisOnly = true`. unsigned analysisFuzzerSeed = 0; diff --git a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp index f69b255..d7492c9 100644 --- a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp @@ -26,17 +26,18 @@ struct ConstantOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto constantOp = cast(op); + auto type = constantOp.getType().dyn_cast(); + + // Only ranked tensors are supported. + if (!type) + return failure(); Attribute memorySpace; - if (options.defaultMemorySpace.has_value()) - memorySpace = *options.defaultMemorySpace; + if (auto memSpace = options.defaultMemorySpaceFn(type)) + memorySpace = *memSpace; else return constantOp->emitError("could not infer memory space"); - // Only ranked tensors are supported. - if (!isa(constantOp.getType())) - return failure(); - // Only constants inside a module are supported. auto moduleOp = constantOp->getParentOfType(); if (!moduleOp) diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp index 6ca9702..8f0f6d1 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -682,11 +682,12 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options, return bufferizableOp.getBufferType(value, options, invocationStack); // Op is not bufferizable. - if (!options.defaultMemorySpace.has_value()) + auto memSpace = + options.defaultMemorySpaceFn(value.getType().cast()); + if (!memSpace.has_value()) return op->emitError("could not infer memory space"); - return getMemRefType(value, options, /*layout=*/{}, - *options.defaultMemorySpace); + return getMemRefType(value, options, /*layout=*/{}, *memSpace); } bool bufferization::hasTensorSemantics(Operation *op) { @@ -936,11 +937,12 @@ FailureOr bufferization::detail::defaultGetBufferType( // If we do not know the memory space and there is no default memory space, // report a failure. 
- if (!options.defaultMemorySpace.has_value()) + auto memSpace = + options.defaultMemorySpaceFn(value.getType().cast<TensorType>()); + if (!memSpace.has_value()) return op->emitError("could not infer memory space"); - return getMemRefType(value, options, /*layout=*/{}, - *options.defaultMemorySpace); + return getMemRefType(value, options, /*layout=*/{}, *memSpace); } bool bufferization::detail::defaultIsRepetitiveRegion( diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp index eb4a96f..34a0c59 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp @@ -234,8 +234,8 @@ AllocTensorOp::getBufferType(Value value, const BufferizationOptions &options, if (failed(copyBufferType)) return failure(); memorySpace = copyBufferType->getMemorySpace(); - } else if (options.defaultMemorySpace.has_value()) { - memorySpace = *options.defaultMemorySpace; + } else if (auto ms = options.defaultMemorySpaceFn(getType())) { + memorySpace = *ms; } else { return getOperation()->emitError("could not infer memory space"); } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index dc94b72..208cbda 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -210,8 +210,12 @@ struct OneShotBufferizePass opt.dumpAliasSets = dumpAliasSets; opt.setFunctionBoundaryTypeConversion( parseLayoutMapOption(functionBoundaryTypeConversion)); - if (mustInferMemorySpace) - opt.defaultMemorySpace = std::nullopt; + if (mustInferMemorySpace) { + opt.defaultMemorySpaceFn = + [](TensorType t) -> std::optional<Attribute> { + return std::nullopt; + }; + } opt.printConflicts = printConflicts; opt.testAnalysisOnly = testAnalysisOnly; opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp index 07cd1f9..4cdbbf3 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp @@ -66,7 +66,7 @@ getBufferizedFunctionArgType(FuncOp funcOp, int64_t index, assert(tensorType && "expected TensorType"); BaseMemRefType memrefType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpace, funcOp, options); + tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, options); auto layoutAttr = funcOp.getArgAttrOfType<AffineMapAttr>( index, BufferizationDialect::kBufferLayoutAttrName); @@ -443,7 +443,8 @@ struct FuncOpInterface // Note: If `inferFunctionResultLayout = true`, casts are later folded // away.
BaseMemRefType resultType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpace, funcOp, options); + tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, + options); Value toMemrefOp = rewriter.create<bufferization::ToMemrefOp>( loc, resultType, returnVal); returnValues.push_back(toMemrefOp); diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp index 678b7c0..957f631 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -473,14 +473,14 @@ struct FromElementsOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto fromElementsOp = cast<tensor::FromElementsOp>(op); + auto tensorType = cast<RankedTensorType>(fromElementsOp.getType()); // TODO: Implement memory space for this op. - if (options.defaultMemorySpace != Attribute()) + if (options.defaultMemorySpaceFn(tensorType) != Attribute()) return op->emitError("memory space not implemented yet"); // Allocate a buffer for the result. Location loc = op->getLoc(); - auto tensorType = cast<RankedTensorType>(fromElementsOp.getType()); auto shape = tensorType.getShape(); // TODO: Create alloc_tensor ops during TensorCopyInsertion. FailureOr<Value> tensorAlloc = allocateTensorForShapedValue( @@ -588,8 +588,10 @@ struct GenerateOpInterface const BufferizationOptions &options) const { auto generateOp = cast<tensor::GenerateOp>(op); + auto type = generateOp.getResult().getType(); + // TODO: Implement memory space for this op. - if (options.defaultMemorySpace != Attribute()) + if (options.defaultMemorySpaceFn(type) != Attribute()) return op->emitError("memory space not implemented yet"); // Allocate memory. @@ -1007,10 +1009,6 @@ struct SplatOpInterface OpBuilder::InsertionGuard g(rewriter); auto splatOp = cast<tensor::SplatOp>(op); - // TODO: Implement memory space for this op. - if (options.defaultMemorySpace != Attribute()) - return op->emitError("memory space not implemented yet"); - // Allocate memory. Location loc = op->getLoc(); FailureOr<Value> tensorAlloc = allocateTensorForShapedValue( @@ -1021,6 +1019,11 @@ struct SplatOpInterface // Create linalg::MapOp. auto tensorType = cast<RankedTensorType>(tensorAlloc->getType()); + + // TODO: Implement memory space for this op. + if (options.defaultMemorySpaceFn(tensorType) != Attribute()) + return op->emitError("memory space not implemented yet"); + auto linalgOp = rewriter.create<linalg::MapOp>(loc, tensorType, /*inputs=*/ValueRange(), /*init=*/*tensorAlloc); diff --git a/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp b/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp index fedfbe3..2991a3c 100644 --- a/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp +++ b/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp @@ -44,8 +44,10 @@ struct TestTensorCopyInsertionPass bufferization::OneShotBufferizationOptions options; options.allowReturnAllocsFromLoops = allowReturnAllocsFromLoops; options.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; - if (mustInferMemorySpace) - options.defaultMemorySpace = std::nullopt; + if (mustInferMemorySpace) { + options.defaultMemorySpaceFn = + [](TensorType t) -> std::optional<Attribute> { return std::nullopt; }; + } if (failed(bufferization::insertTensorCopies(getOperation(), options))) signalPassFailure(); } -- cgit v1.1
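Before the next patch, a hedged sketch of how a downstream pipeline might use the hook added above. Deriving the memref memory space from a ranked tensor's encoding attribute matches the commit's stated goal, but this lambda is an assumed example, not code from the patch:

// `options` is an mlir::bufferization::BufferizationOptions.
options.defaultMemorySpaceFn =
    [](TensorType t) -> std::optional<Attribute> {
  // One plausible policy: reuse the tensor's `encoding` as the memory
  // space of the resulting memref, when an encoding is present.
  if (auto rankedTy = dyn_cast<RankedTensorType>(t))
    if (Attribute enc = rankedTy.getEncoding())
      return enc;
  // Otherwise keep the old default: the empty (default) memory space.
  return Attribute();
};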
From 92eaf036bf22ecc276146cd073208e6a867af8d4 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 16:13:22 +0000 Subject: [NFC][RemoveDIs] Remove conditional compilation for RemoveDIs (#81149) A colleague observes that switching the default value of LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS to "On" hasn't flipped the value in their CMakeCache.txt. This probably means that everyone with an existing build tree is not going to have support built in, meaning everyone in LLVM would need to clean+rebuild their worktree when we flip the switch on... which doesn't sound good. So instead, just delete the flag and everything it does, making everyone build and run ~400 lit tests in RemoveDIs mode. None of the buildbots have had trouble with this, so it Should Be Fine (TM). (Sending for review as this is changing various comments, and touches several different areas -- I don't want to get too punchy).
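As a hedged illustration of the now-unconditional behavior (BB is an assumed llvm::BasicBlock *): the head/tail bits survive iterator copies and assignments but are cleared whenever the iterator moves, which is what the unit-test changes below exercise:

BasicBlock::iterator It = BB->begin();
It.setHeadBit(true);            // mark "debug-info attaches before here"
BasicBlock::iterator Copy = It; // copy construction preserves the bits
assert(Copy.getHeadBit());
++Copy;                         // moving the iterator clears both bits
assert(!Copy.getHeadBit() && !Copy.getTailBit());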
--- llvm/CMakeLists.txt | 3 --- llvm/cmake/modules/HandleLLVMOptions.cmake | 4 ---- llvm/include/llvm/ADT/ilist_iterator.h | 23 ----------------------- llvm/tools/llc/llc.cpp | 8 ++------ llvm/tools/llvm-link/llvm-link.cpp | 8 ++------ llvm/tools/llvm-lto/llvm-lto.cpp | 8 ++------ llvm/tools/llvm-lto2/llvm-lto2.cpp | 8 ++------ llvm/tools/llvm-reduce/llvm-reduce.cpp | 8 ++------ llvm/tools/opt/optdriver.cpp | 8 ++------ llvm/unittests/ADT/IListIteratorBitsTest.cpp | 18 ++---------------- llvm/unittests/IR/BasicBlockDbgInfoTest.cpp | 6 ------ 11 files changed, 14 insertions(+), 88 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c31980a..81f2753 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -653,9 +653,6 @@ option(LLVM_USE_OPROFILE option(LLVM_EXTERNALIZE_DEBUGINFO "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) -option(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS - "Add extra Booleans to ilist_iterators to communicate facts for debug-info" ON) - set(LLVM_CODESIGNING_IDENTITY "" CACHE STRING "Sign executables and dylibs with the given identity or skip if empty (Darwin Only)") diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 0699a85..486df22 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -140,10 +140,6 @@ if(LLVM_ENABLE_EXPENSIVE_CHECKS) endif() endif() -if(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS) - add_compile_definitions(EXPERIMENTAL_DEBUGINFO_ITERATORS) -endif() - if (LLVM_ENABLE_STRICT_FIXED_SIZE_VECTORS) add_compile_definitions(STRICT_FIXED_SIZE_VECTORS) endif() diff --git a/llvm/include/llvm/ADT/ilist_iterator.h b/llvm/include/llvm/ADT/ilist_iterator.h index 9047b9b..2393c4d 100644 --- a/llvm/include/llvm/ADT/ilist_iterator.h +++ b/llvm/include/llvm/ADT/ilist_iterator.h @@ -202,17 +202,12 @@ private: node_pointer NodePtr = nullptr; -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS - // (Default: Off) Allow extra position-information flags to be stored - // in iterators, in aid of removing debug-info intrinsics from LLVM. - /// Is this position intended to contain any debug-info immediately before /// the position? mutable bool HeadInclusiveBit = false; /// Is this position intended to contain any debug-info immediately after /// the position? mutable bool TailInclusiveBit = false; -#endif public: /// Create from an ilist_node. @@ -231,10 +226,8 @@ public: const ilist_iterator_w_bits &RHS, std::enable_if_t = nullptr) : NodePtr(RHS.NodePtr) { -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = RHS.HeadInclusiveBit; TailInclusiveBit = RHS.TailInclusiveBit; -#endif } // This is templated so that we can allow assigning to a const iterator from @@ -243,10 +236,8 @@ public: std::enable_if_t operator=(const ilist_iterator_w_bits &RHS) { NodePtr = RHS.NodePtr; -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = RHS.HeadInclusiveBit; TailInclusiveBit = RHS.TailInclusiveBit; -#endif return *this; } @@ -280,10 +271,8 @@ public: const_cast::node_reference>( *NodePtr)); -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS New.HeadInclusiveBit = HeadInclusiveBit; New.TailInclusiveBit = TailInclusiveBit; -#endif return New; } return ilist_iterator_w_bits(); @@ -309,18 +298,14 @@ public: // Increment and decrement operators... ilist_iterator_w_bits &operator--() { NodePtr = IsReverse ? 
NodePtr->getNext() : NodePtr->getPrev(); -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = false; TailInclusiveBit = false; -#endif return *this; } ilist_iterator_w_bits &operator++() { NodePtr = IsReverse ? NodePtr->getPrev() : NodePtr->getNext(); -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS HeadInclusiveBit = false; TailInclusiveBit = false; -#endif return *this; } ilist_iterator_w_bits operator--(int) { @@ -340,18 +325,10 @@ public: /// Check for end. Only valid if ilist_sentinel_tracking. bool isEnd() const { return NodePtr ? NodePtr->isSentinel() : false; } -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS bool getHeadBit() const { return HeadInclusiveBit; } bool getTailBit() const { return TailInclusiveBit; } void setHeadBit(bool SetBit) const { HeadInclusiveBit = SetBit; } void setTailBit(bool SetBit) const { TailInclusiveBit = SetBit; } -#else - // Store and return no information if we're not using this feature. - bool getHeadBit() const { return false; } - bool getTailBit() const { return false; } - void setHeadBit(bool SetBit) const { (void)SetBit; } - void setTailBit(bool SetBit) const { (void)SetBit; } -#endif }; template struct simplify_type; diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index 3e2567c..b292f70 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -365,15 +365,11 @@ int main(int argc, char **argv) { } // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; if (TimeTrace) timeTraceProfilerInitialize(TimeTraceGranularity, argv[0]); diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index d50e067..e6c219a 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -473,15 +473,11 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm linker\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; LLVMContext Context; Context.setDiagnosticHandler(std::make_unique(), diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index f272814..7943d69 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -945,15 +945,11 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm LTO linker\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. 
UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; if (OptLevel < '0' || OptLevel > '3') error("optimization level must be between 0 and 3"); diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index c212374..d5de4f6 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -230,15 +230,11 @@ static int run(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "Resolution-based LTO test harness"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; // FIXME: Workaround PR30396 which means that a symbol can appear // more than once if it is defined in module-level assembly and diff --git a/llvm/tools/llvm-reduce/llvm-reduce.cpp b/llvm/tools/llvm-reduce/llvm-reduce.cpp index 71ce0ca5..f913771 100644 --- a/llvm/tools/llvm-reduce/llvm-reduce.cpp +++ b/llvm/tools/llvm-reduce/llvm-reduce.cpp @@ -151,15 +151,11 @@ int main(int Argc, char **Argv) { cl::ParseCommandLineOptions(Argc, Argv, "LLVM automatic testcase reducer.\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; if (Argc == 1) { cl::PrintHelpMessage(); diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp index 3f66bfc..85f5294 100644 --- a/llvm/tools/opt/optdriver.cpp +++ b/llvm/tools/opt/optdriver.cpp @@ -462,15 +462,11 @@ extern "C" int optMain( argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n"); // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // new debug-info format. if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. + // Turn the new debug-info format on. UseNewDbgInfoFormat = true; } -#endif - (void)TryUseNewDbgInfoFormat; LLVMContext Context; diff --git a/llvm/unittests/ADT/IListIteratorBitsTest.cpp b/llvm/unittests/ADT/IListIteratorBitsTest.cpp index 167b30a..8ae73b1 100644 --- a/llvm/unittests/ADT/IListIteratorBitsTest.cpp +++ b/llvm/unittests/ADT/IListIteratorBitsTest.cpp @@ -55,10 +55,8 @@ TEST(IListIteratorBitsTest, ConsAndAssignment) { simple_ilist>::iterator I, I2; -// Two sets of tests: if we've compiled in the iterator bits, then check that -// HeadInclusiveBit and TailInclusiveBit are preserved on assignment and copy -// construction, but not on other operations. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + // Check that HeadInclusiveBit and TailInclusiveBit are preserved on + // assignment and copy construction, but not on other operations. 
I = L.begin(); EXPECT_FALSE(I.getHeadBit()); EXPECT_FALSE(I.getTailBit()); @@ -85,18 +83,6 @@ TEST(IListIteratorBitsTest, ConsAndAssignment) { simple_ilist>::iterator I3(I); EXPECT_TRUE(I3.getHeadBit()); EXPECT_TRUE(I3.getTailBit()); -#else - // The calls should be available, but shouldn't actually store information. - I = L.begin(); - EXPECT_FALSE(I.getHeadBit()); - EXPECT_FALSE(I.getTailBit()); - I.setHeadBit(true); - I.setTailBit(true); - EXPECT_FALSE(I.getHeadBit()); - EXPECT_FALSE(I.getTailBit()); - // Suppress warnings as we don't test with this variable. - (void)I2; -#endif } class dummy { diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp index ef2b288..53b191c 100644 --- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp +++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp @@ -27,11 +27,6 @@ using namespace llvm; extern cl::opt UseNewDbgInfoFormat; -// None of these tests are meaningful or do anything if we do not have the -// experimental "head" bit compiled into ilist_iterator (aka -// ilist_iterator_w_bits), thus there's no point compiling these tests in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS - static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { SMDiagnostic Err; std::unique_ptr Mod = parseAssemblyString(IR, Err, C); @@ -1535,4 +1530,3 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) { } } // End anonymous namespace. -#endif // EXPERIMENTAL_DEBUGINFO_ITERATORS -- cgit v1.1 From 7d19dc50de2c81ead6af750bcddd139ac99a48b5 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Thu, 8 Feb 2024 18:23:00 +0200 Subject: [AMDGPU][True16] Support VOP3 source DPP operands. (#80892) --- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 43 ++++++-- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 38 +++++++ .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 32 ++++-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 23 ++++- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 6 ++ .../AMDGPU/GlobalISel/inst-select-fceil.s16.mir | 6 +- .../AMDGPU/GlobalISel/inst-select-ffloor.s16.mir | 6 +- llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 4 +- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s | 85 ++++++++++++++++ .../MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 64 ++++++------ .../AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s | 25 +++++ .../test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 24 +++-- .../AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt | 111 +++++++++++++++------ .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 51 ++++++++-- 15 files changed, 410 insertions(+), 109 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 225e781..a94da99 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -314,8 +314,9 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64); } - bool isRegOrInlineImmWithFP16InputMods() const { - return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16); + template bool isRegOrInlineImmWithFP16InputMods() const { + return isRegOrInline( + IsFake16 ? 
AMDGPU::VS_32RegClassID : AMDGPU::VS_16RegClassID, MVT::f16); } bool isRegOrInlineImmWithFP32InputMods() const { @@ -8151,7 +8152,7 @@ ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) { // Determines which bit DST_OP_SEL occupies in the op_sel operand according to // the number of src operands present, then copies that bit into src0_modifiers. -void cvtVOP3DstOpSelOnly(MCInst &Inst) { +static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) { int Opc = Inst.getOpcode(); int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); if (OpSelIdx == -1) @@ -8168,23 +8169,34 @@ void cvtVOP3DstOpSelOnly(MCInst &Inst) { unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); - if ((OpSel & (1 << SrcNum)) != 0) { - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); - uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); - Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL); + int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DstIdx == -1) + return; + + const MCOperand &DstOp = Inst.getOperand(DstIdx); + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + if (DstOp.isReg() && + MRI.getRegClass(AMDGPU::VGPR_16RegClassID).contains(DstOp.getReg())) { + if (AMDGPU::isHi(DstOp.getReg(), MRI)) + ModVal |= SISrcMods::DST_OP_SEL; + } else { + if ((OpSel & (1 << SrcNum)) != 0) + ModVal |= SISrcMods::DST_OP_SEL; } + Inst.getOperand(ModIdx).setImm(ModVal); } void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) { cvtVOP3P(Inst, Operands); - cvtVOP3DstOpSelOnly(Inst); + cvtVOP3DstOpSelOnly(Inst, *getMRI()); } void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { cvtVOP3P(Inst, Operands, OptionalIdx); - cvtVOP3DstOpSelOnly(Inst); + cvtVOP3DstOpSelOnly(Inst, *getMRI()); } static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { @@ -8433,8 +8445,17 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, uint32_t ModVal = 0; - if ((OpSel & (1 << J)) != 0) - ModVal |= SISrcMods::OP_SEL_0; + const MCOperand &SrcOp = Inst.getOperand(OpIdx); + if (SrcOp.isReg() && getMRI() + ->getRegClass(AMDGPU::VGPR_16RegClassID) + .contains(SrcOp.getReg())) { + bool VGPRSuffixIsHi = AMDGPU::isHi(SrcOp.getReg(), *getMRI()); + if (VGPRSuffixIsHi) + ModVal |= SISrcMods::OP_SEL_0; + } else { + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + } if ((OpSelHi & (1 << J)) != 0) ModVal |= SISrcMods::OP_SEL_1; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index fba9eb5..85377d0 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -913,6 +913,41 @@ static VOPModifiers collectVOPModifiers(const MCInst &MI, return Modifiers; } +// Instructions decode the op_sel/suffix bits into the src_modifier +// operands. Copy those bits into the src operands for true16 VGPRs. 
+void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const { + const unsigned Opc = MI.getOpcode(); + const MCRegisterClass &ConversionRC = + MRI.getRegClass(AMDGPU::VGPR_16RegClassID); + constexpr std::array, 4> OpAndOpMods = { + {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers, + SISrcMods::DST_OP_SEL}}}; + for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); + int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName); + if (OpIdx == -1 || OpModsIdx == -1) + continue; + MCOperand &Op = MI.getOperand(OpIdx); + if (!Op.isReg()) + continue; + if (!ConversionRC.contains(Op.getReg())) + continue; + unsigned OpEnc = MRI.getEncodingValue(Op.getReg()); + const MCOperand &OpMods = MI.getOperand(OpModsIdx); + unsigned ModVal = OpMods.getImm(); + if (ModVal & OpSelMask) { // isHi + unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK; + Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1)); + } + } +} + // MAC opcodes have special old and src2 operands. // src2 is tied to dst, while old is not tied (but assumed to be). bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const { @@ -968,6 +1003,7 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + convertTrue16OpSel(MI); auto Mods = collectVOPModifiers(MI); insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), AMDGPU::OpName::op_sel); @@ -991,6 +1027,8 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + convertTrue16OpSel(MI); + int VDstInIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); if (VDstInIdx != -1) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 5a89b30..02feaf55 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -203,6 +203,7 @@ public: DecodeStatus convertVOP3PDPPInst(MCInst &MI) const; DecodeStatus convertVOPCDPPInst(MCInst &MI) const; void convertMacDPPInst(MCInst &MI) const; + void convertTrue16OpSel(MCInst &MI) const; enum OpWidthTy { OPW32, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index a812cdc..8bf0568 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -756,14 +756,14 @@ void SIFoldOperands::foldOperand( int UseOpIdx, SmallVectorImpl &FoldList, SmallVectorImpl &CopiesToReplace) const { - const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx); - if (!isUseSafeToFold(*UseMI, UseOp)) + if (!isUseSafeToFold(*UseMI, *UseOp)) return; // FIXME: Fold operands with subregs. 
- if (UseOp.isReg() && OpToFold.isReg() && - (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)) + if (UseOp->isReg() && OpToFold.isReg() && + (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister)) return; // Special case for REG_SEQUENCE: We can't fold literals into @@ -859,7 +859,6 @@ void SIFoldOperands::foldOperand( if (MovOp == AMDGPU::COPY) return; - UseMI->setDesc(TII->get(MovOp)); MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); while (ImpOpI != ImpOpE) { @@ -867,6 +866,19 @@ void SIFoldOperands::foldOperand( ImpOpI++; UseMI->removeOperand(UseMI->getOperandNo(Tmp)); } + UseMI->setDesc(TII->get(MovOp)); + + if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { + const auto &SrcOp = UseMI->getOperand(UseOpIdx); + MachineOperand NewSrcOp(SrcOp); + MachineFunction *MF = UseMI->getParent()->getParent(); + UseMI->removeOperand(1); + UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers + UseMI->addOperand(NewSrcOp); // src0 + UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel + UseOpIdx = 2; + UseOp = &UseMI->getOperand(UseOpIdx); + } CopiesToReplace.push_back(UseMI); } else { if (UseMI->isCopy() && OpToFold.isReg() && @@ -1027,7 +1039,7 @@ void SIFoldOperands::foldOperand( // Don't fold into target independent nodes. Target independent opcodes // don't have defined register classes. - if (UseDesc.isVariadic() || UseOp.isImplicit() || + if (UseDesc.isVariadic() || UseOp->isImplicit() || UseDesc.operands()[UseOpIdx].RegClass == -1) return; } @@ -1062,17 +1074,17 @@ void SIFoldOperands::foldOperand( TRI->getRegClass(FoldDesc.operands()[0].RegClass); // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) { - Register UseReg = UseOp.getReg(); + if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) { + Register UseReg = UseOp->getReg(); const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); if (AMDGPU::getRegBitWidth(*UseRC) != 64) return; APInt Imm(64, OpToFold.getImm()); - if (UseOp.getSubReg() == AMDGPU::sub0) { + if (UseOp->getSubReg() == AMDGPU::sub0) { Imm = Imm.getLoBits(32); } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); + assert(UseOp->getSubReg() == AMDGPU::sub1); Imm = Imm.getHiBits(32); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 7edec5a..2259977 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1148,7 +1148,13 @@ def FPT16InputModsMatchClass : FPInputModsMatchClass<16> { def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; -def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>; +class FP16VCSrcInputModsMatchClass + : FPVCSrcInputModsMatchClass<16> { + let Name = !if(IsFake16, "RegOrInlineImmWithFPFake16InputMods", + "RegOrInlineImmWithFPT16InputMods"); + let PredicateMethod = "isRegOrInlineImmWithFP16InputMods<" # + !if(IsFake16, "true", "false") # ">"; +} def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>; class InputMods : Operand { @@ -1166,7 +1172,8 @@ def FPT16InputMods : FPInputMods; def FP32InputMods : FPInputMods; def FP64InputMods : FPInputMods; -def FP16VCSrcInputMods : FPInputMods; +class FP16VCSrcInputMods + : FPInputMods>; def FP32VCSrcInputMods : FPInputMods; class IntInputModsMatchClass : AsmOperandClass { @@ -1653,11 +1660,11 @@ class 
getSrcModDPP_t16 { } // Return type of input modifiers operand for specified input operand for DPP -class getSrcModVOP3DPP { +class getSrcModVOP3DPP { Operand ret = !if (VT.isFP, !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), - FP16VCSrcInputMods, FP32VCSrcInputMods), + FP16VCSrcInputMods, FP32VCSrcInputMods), Int32VCSrcInputMods); } @@ -2450,6 +2457,10 @@ class VOP_PAT_GEN : VOPProfile : VOPProfile { let IsTrue16 = 1; let IsRealTrue16 = 1; + + let HasOpSel = 1; + let HasModifiers = 1; // All instructions at least have OpSel. + // Most DstVT are 16-bit, but not all. let DstRC = getVALUDstForVT.ret; let DstRC64 = getVALUDstForVT.ret; @@ -2461,6 +2472,10 @@ class VOPProfile_True16 : VOPProfile { let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0VOP3DPP = VGPRSrc_16; + let Src0ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; let DstRC64 = getVALUDstForVT.ret; let Src0RC64 = getVOP3SrcForVT.ret; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c9dbe02..aabb6c2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1235,6 +1235,12 @@ def VGPRSrc_16_Lo128 : RegisterOperand { let EncoderMethod = "getMachineOpValueT16Lo128"; } +// True 16 operands. +def VGPRSrc_16 : RegisterOperand { + let DecoderMethod = "DecodeVGPR_16RegisterClass"; + let EncoderMethod = "getMachineOpValueT16"; +} + //===----------------------------------------------------------------------===// // ASrc_* Operands with an AccVGPR //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir index 84da311..014534a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir @@ -50,7 +50,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; @@ -88,7 +88,7 @@ body: | ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; @@ -127,7 +127,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit 
$mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir index 30975a8..dcf9e16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir @@ -59,7 +59,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; @@ -97,7 +97,7 @@ body: | ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; @@ -136,7 +136,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]] - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]] ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]] ; diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index 7767aa5..9ae5f55 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -66,7 +66,7 @@ body: | ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec + ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec ; ; FAKE16-LABEL: name: ceil_f16 ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -87,7 +87,7 @@ body: | ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec + ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec ; ; 
FAKE16-LABEL: name: floor_f16 ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s new file mode 100644 index 0000000..1871a41 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s + +v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_mirror +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_half_mirror +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shl:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shl:15 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shr:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shr:15 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_ror:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_ror:15 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_floor_f16_e64_dpp v5, v1 row_mirror +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_half_mirror +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shl:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shl:15 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shr:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shr:15 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_ror:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_ror:15 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 9a65c66..701a725 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -42,46 +42,52 @@ v_bfrev_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +v_ceil_f16_e64_dpp v5.l, v1.h quad_perm:[3,2,1,0] +// GFX11: [0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_ceil_f16_e64_dpp v5.h, v1.l quad_perm:[3,2,1,0] +// GFX11: [0x05,0x40,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_ceil_f16_e64_dpp v5, v1 row_mirror +v_ceil_f16_e64_dpp v5.l, v1.l row_mirror // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_half_mirror +v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shl:1 +v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shl:15 +v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shr:1 +v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_shr:15 +v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_ror:1 +v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_ror:15 +v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 
-v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_ceil_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] @@ -1512,46 +1518,46 @@ v_ffbl_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi v_ffbl_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_floor_f16_e64_dpp v5, v1 row_mirror +v_floor_f16_e64_dpp v5.l, v1.l row_mirror // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_half_mirror +v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shl:1 +v_floor_f16_e64_dpp v5.l, v1.l row_shl:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shl:15 +v_floor_f16_e64_dpp v5.l, v1.l row_shl:15 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shr:1 +v_floor_f16_e64_dpp v5.l, v1.l row_shr:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_shr:15 +v_floor_f16_e64_dpp v5.l, v1.l row_shr:15 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_ror:1 +v_floor_f16_e64_dpp v5.l, v1.l row_ror:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_ror:15 +v_floor_f16_e64_dpp v5.l, v1.l row_ror:15 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 // GFX11: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s new file mode 100644 index 0000000..1bef1fe2 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s @@ -0,0 +1,25 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s + +v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x00,0xdc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x81,0xdc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x00,0xdb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x81,0xdb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 3897b82..043e0f9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -9,16 +9,22 @@ v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +v_ceil_f16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5.h, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x40,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX11: [0x05,0x00,0xdc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: 
[0xff,0x81,0xdc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] @@ -375,16 +381,16 @@ v_ffbl_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ffbl_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX11: [0x05,0x00,0xdb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x81,0xdb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index cf29efa..fe50845 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s # GFX11: v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff @@ -42,48 +43,74 @@ # GFX11: v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 + # GFX11: v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff @@ -1302,48 +1329,74 @@ # GFX11: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# 
GFX11: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 + +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 + +# GFX11-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_floor_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 + # GFX11: v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index bfda6d1..c1b500e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s # GFX11: v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 @@ -6,18 +7,34 @@ # GFX11: v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_ceil_f16_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_ceil_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 + # GFX11: v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 @@ -288,18 +305,34 @@ # GFX11: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x48,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0x05,0x08,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 + +# GFX11-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_floor_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xff,0xc1,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 + # GFX11: v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -- cgit v1.1 From b846613837d83989d99d33f4b90db7bad019aa8c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 14:01:38 +0000 Subject: [X86] X86FixupVectorConstants - add destination register width to rebuildSplatCst/rebuildZeroUpperCst/rebuildExtCst callbacks As found on #81136 - we aren't correctly handling cases where the constant pool entry is wider than the destination register width, causing incorrect scaling of the truncated constant for load-extension cases. This first patch just pulls out the destination register width argument; it's still currently driven by the constant pool entry, but that will be addressed in a followup.
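To make the mis-scaling concrete, here is a minimal standalone sketch of the load-extension rebuild step (illustrative only: gatherSrcElts and its parameter names are invented, not the pass's actual helpers). The raw constant-pool bits have to be clamped to the destination register width before being split into elements, which is exactly what threading the register width through the callbacks enables:

#include "llvm/ADT/APInt.h"

using llvm::APInt;

// Gather the low SrcEltBitWidth bits of each destination element. The
// zextOrTrunc clamps a possibly wider constant-pool entry to the register
// width; skipping it would derive the element split from the pool width
// and scale the truncated constant incorrectly.
static APInt gatherSrcElts(const APInt &PoolBits, unsigned RegBitWidth,
                           unsigned NumElts, unsigned SrcEltBitWidth) {
  APInt Bits = PoolBits.zextOrTrunc(RegBitWidth);
  unsigned DstEltBitWidth = RegBitWidth / NumElts;
  APInt TruncBits(NumElts * SrcEltBitWidth, 0);
  for (unsigned I = 0; I != NumElts; ++I) {
    APInt Elt = Bits.extractBits(DstEltBitWidth, I * DstEltBitWidth);
    TruncBits.insertBits(Elt.trunc(SrcEltBitWidth), I * SrcEltBitWidth);
  }
  return TruncBits;
}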
--- llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 52 ++++++++++++++----------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 9c46cee..9b90b5e 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -121,6 +121,13 @@ static std::optional extractConstantBits(const Constant *C) { return std::nullopt; } +static std::optional extractConstantBits(const Constant *C, + unsigned NumBits) { + if (std::optional Bits = extractConstantBits(C)) + return Bits->zextOrTrunc(NumBits); + return std::nullopt; +} + // Attempt to compute the splat width of bits data by normalizing the splat to // remove undefs. static std::optional getSplatableConstant(const Constant *C, @@ -217,16 +224,15 @@ static Constant *rebuildConstant(LLVMContext &Ctx, Type *SclTy, // Attempt to rebuild a normalized splat vector constant of the requested splat // width, built up of potentially smaller scalar values. -static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumElts*/, - unsigned SplatBitWidth) { +static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumBits*/, + unsigned /*NumElts*/, unsigned SplatBitWidth) { std::optional Splat = getSplatableConstant(C, SplatBitWidth); if (!Splat) return nullptr; // Determine scalar size to use for the constant splat vector, clamping as we // might have found a splat smaller than the original constant data. - const Type *OriginalType = C->getType(); - Type *SclTy = OriginalType->getScalarType(); + Type *SclTy = C->getType()->getScalarType(); unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); NumSclBits = std::min(NumSclBits, SplatBitWidth); @@ -236,20 +242,19 @@ static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumElts*/, : 64; // Extract per-element bits. - return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits); + return rebuildConstant(C->getContext(), SclTy, *Splat, NumSclBits); } -static Constant *rebuildZeroUpperCst(const Constant *C, unsigned /*NumElts*/, +static Constant *rebuildZeroUpperCst(const Constant *C, unsigned NumBits, + unsigned /*NumElts*/, unsigned ScalarBitWidth) { - Type *Ty = C->getType(); - Type *SclTy = Ty->getScalarType(); - unsigned NumBits = Ty->getPrimitiveSizeInBits(); + Type *SclTy = C->getType()->getScalarType(); unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); LLVMContext &Ctx = C->getContext(); if (NumBits > ScalarBitWidth) { // Determine if the upper bits are all zero. - if (std::optional Bits = extractConstantBits(C)) { + if (std::optional Bits = extractConstantBits(C, NumBits)) { if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) { // If the original constant was made of smaller elements, try to retain // those types. 
@@ -266,16 +271,15 @@ static Constant *rebuildZeroUpperCst(const Constant *C, unsigned /*NumElts*/, return nullptr; } -static Constant *rebuildExtCst(const Constant *C, bool IsSExt, unsigned NumElts, +static Constant *rebuildExtCst(const Constant *C, bool IsSExt, + unsigned NumBits, unsigned NumElts, unsigned SrcEltBitWidth) { - Type *Ty = C->getType(); - unsigned NumBits = Ty->getPrimitiveSizeInBits(); unsigned DstEltBitWidth = NumBits / NumElts; assert((NumBits % NumElts) == 0 && (NumBits % SrcEltBitWidth) == 0 && (DstEltBitWidth % SrcEltBitWidth) == 0 && (DstEltBitWidth > SrcEltBitWidth) && "Illegal extension width"); - if (std::optional Bits = extractConstantBits(C)) { + if (std::optional Bits = extractConstantBits(C, NumBits)) { assert((Bits->getBitWidth() / DstEltBitWidth) == NumElts && (Bits->getBitWidth() % DstEltBitWidth) == 0 && "Unexpected constant extension"); @@ -290,19 +294,20 @@ static Constant *rebuildExtCst(const Constant *C, bool IsSExt, unsigned NumElts, TruncBits.insertBits(Elt.trunc(SrcEltBitWidth), I * SrcEltBitWidth); } + Type *Ty = C->getType(); return rebuildConstant(Ty->getContext(), Ty->getScalarType(), TruncBits, SrcEltBitWidth); } return nullptr; } -static Constant *rebuildSExtCst(const Constant *C, unsigned NumElts, - unsigned SrcEltBitWidth) { - return rebuildExtCst(C, true, NumElts, SrcEltBitWidth); +static Constant *rebuildSExtCst(const Constant *C, unsigned NumBits, + unsigned NumElts, unsigned SrcEltBitWidth) { + return rebuildExtCst(C, true, NumBits, NumElts, SrcEltBitWidth); } -static Constant *rebuildZExtCst(const Constant *C, unsigned NumElts, - unsigned SrcEltBitWidth) { - return rebuildExtCst(C, false, NumElts, SrcEltBitWidth); +static Constant *rebuildZExtCst(const Constant *C, unsigned NumBits, + unsigned NumElts, unsigned SrcEltBitWidth) { + return rebuildExtCst(C, false, NumBits, NumElts, SrcEltBitWidth); } bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, @@ -320,7 +325,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, int Op; int NumCstElts; int BitWidth; - std::function + std::function RebuildConstant; }; auto FixupConstant = [&](ArrayRef Fixups, unsigned OperandNo) { @@ -335,12 +340,13 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && "Unexpected number of operands!"); if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { + unsigned NumBits = C->getType()->getPrimitiveSizeInBits(); for (const FixupEntry &Fixup : Fixups) { if (Fixup.Op) { // Construct a suitable constant and adjust the MI to use the new // constant pool entry. 
- if (Constant *NewCst = - Fixup.RebuildConstant(C, Fixup.NumCstElts, Fixup.BitWidth)) { + if (Constant *NewCst = Fixup.RebuildConstant( + C, NumBits, Fixup.NumCstElts, Fixup.BitWidth)) { unsigned NewCPI = CP->getConstantPoolIndex(NewCst, Align(Fixup.BitWidth / 8)); MI.setDesc(TII->get(Fixup.Op)); -- cgit v1.1 From eb85c8edf576d27254fa37bf9ed72ec0867756f7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 15:59:19 +0000 Subject: [X86] Add test case for #81136 --- llvm/test/CodeGen/X86/pr81136.ll | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr81136.ll diff --git a/llvm/test/CodeGen/X86/pr81136.ll b/llvm/test/CodeGen/X86/pr81136.ll new file mode 100644 index 0000000..8843adc --- /dev/null +++ b/llvm/test/CodeGen/X86/pr81136.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s + +; FIXME: Should be vpmovzxbq[128,1] instead of vpmovzxbd[128,1,0,0] +define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) { +; CHECK-LABEL: PR81136: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovd %edi, %xmm0 +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vmovdqa (%rdx), %ymm2 +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = [128,1,0,0] +; CHECK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 +; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2 +; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vandnpd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovmskpd %ymm0, %eax +; CHECK-NEXT: popcntl %eax, %eax +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %v0 = bitcast i32 %a0 to <2 x i16> + %v1 = bitcast i32 %a1 to <2 x i16> + %cmp15 = icmp sle <2 x i16> %v1, %v0 + %conv16 = sext <2 x i1> %cmp15 to <2 x i64> + %shuffle29 = shufflevector <2 x i64> %conv16, <2 x i64> , <4 x i32> + %data = load volatile <4 x i64>, ptr %a2, align 32 + %cmp65 = icmp ne <4 x i64> %data, + %cmp67 = icmp ne <4 x i64> %shuffle29, zeroinitializer + %and = and <4 x i1> %cmp65, %cmp67 + %mask = bitcast <4 x i1> %and to i4 + %cnt = tail call i4 @llvm.ctpop.i4(i4 %mask) + %cntz = zext i4 %cnt to i64 + %res = sub nsw i64 0, %cntz + ret i64 %res +} +declare i4 @llvm.ctpop.i4(i4) -- cgit v1.1 From f407be32fe8084fe02c4f16842548d21afdb447f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 16:31:09 +0000 Subject: [X86] X86FixupVectorConstants - rename FixupEntry::BitWidth to FixupEntry::MemBitWidth NFC. 
Make it clearer that this refers to the width of the constant element stored in memory - which won't match the register element width after a sext/zextload --- llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 9b90b5e..32ca9c16 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -324,7 +324,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, struct FixupEntry { int Op; int NumCstElts; - int BitWidth; + int MemBitWidth; std::function RebuildConstant; }; @@ -332,23 +332,23 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, #ifdef EXPENSIVE_CHECKS assert(llvm::is_sorted(Fixups, [](const FixupEntry &A, const FixupEntry &B) { - return (A.NumCstElts * A.BitWidth) < - (B.NumCstElts * B.BitWidth); + return (A.NumCstElts * A.MemBitWidth) < + (B.NumCstElts * B.MemBitWidth); }) && "Constant fixup table not sorted in ascending constant size"); #endif assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && "Unexpected number of operands!"); if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { - unsigned NumBits = C->getType()->getPrimitiveSizeInBits(); + unsigned RegBitWidth = C->getType()->getPrimitiveSizeInBits(); for (const FixupEntry &Fixup : Fixups) { if (Fixup.Op) { // Construct a suitable constant and adjust the MI to use the new // constant pool entry. if (Constant *NewCst = Fixup.RebuildConstant( - C, NumBits, Fixup.NumCstElts, Fixup.BitWidth)) { + C, RegBitWidth, Fixup.NumCstElts, Fixup.MemBitWidth)) { unsigned NewCPI = - CP->getConstantPoolIndex(NewCst, Align(Fixup.BitWidth / 8)); + CP->getConstantPoolIndex(NewCst, Align(Fixup.MemBitWidth / 8)); MI.setDesc(TII->get(Fixup.Op)); MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI); return true; -- cgit v1.1 From 5aeabf2df92b92c71b5dbdb9ae82a37431aa2ee4 Mon Sep 17 00:00:00 2001 From: stephenpeckham <118857872+stephenpeckham@users.noreply.github.com> Date: Thu, 8 Feb 2024 10:44:19 -0600 Subject: [XCOFF][obj2yaml] Support SymbolAlignmentAndType as 2 separate fields in YAML. (#76828) XCOFF encodes a symbol type and alignment in a single 8-bit field. It is easier to read and write YAML files if the fields can be specified separately. This PR causes obj2yaml to write the fields separately and allows yaml2obj to read either the single combined field or the separate fields. --- llvm/include/llvm/ObjectYAML/XCOFFYAML.h | 7 ++ llvm/lib/ObjectYAML/XCOFFEmitter.cpp | 99 +++++++++++++------ llvm/lib/ObjectYAML/XCOFFYAML.cpp | 16 ++- llvm/test/tools/obj2yaml/XCOFF/aix.yaml | 12 ++- llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml | 12 ++- llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml | 114 ++++++++++++++++++++++ llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml | 25 +++++ llvm/tools/obj2yaml/xcoff2yaml.cpp | 4 +- 8 files changed, 250 insertions(+), 39 deletions(-) create mode 100644 llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml diff --git a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h index f1e821f..dd359ac 100644 --- a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h +++ b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h @@ -121,6 +121,9 @@ struct CsectAuxEnt : AuxSymbolEnt { // Common fields for both XCOFF32 and XCOFF64. 
std::optional ParameterHashIndex; std::optional TypeChkSectNum; + std::optional SymbolType; + std::optional SymbolAlignment; + // The two previous values can be encoded as a single value. std::optional SymbolAlignmentAndType; std::optional StorageMappingClass; @@ -237,6 +240,10 @@ template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, XCOFF::StorageMappingClass &Value); }; +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, XCOFF::SymbolType &Value); +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, XCOFF::CFileStringType &Type); }; diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index ccf768c..5b244ff 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +using namespace llvm::object; namespace { @@ -56,14 +57,14 @@ private: bool writeSymbols(); void writeStringTable(); - void writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym); - void writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym); - void writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym); - void writeAuxSymbol(const std::unique_ptr &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym); + bool writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym); + bool writeAuxSymbol(const std::unique_ptr &AuxSym); XCOFFYAML::Object &Obj; bool Is64Bit = false; @@ -181,7 +182,7 @@ bool XCOFFWriter::initStringTable() { StrTblBuilder.clear(); if (Obj.StrTbl.Strings) { - // All specified strings should be added to the string table. + // Add all specified strings to the string table. 
for (StringRef StringEnt : *Obj.StrTbl.Strings) StrTblBuilder.add(StringEnt); @@ -524,12 +525,44 @@ bool XCOFFWriter::writeRelocations() { return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { + uint8_t SymAlignAndType = 0; + if (AuxSym.SymbolAlignmentAndType) { + if (AuxSym.SymbolType || AuxSym.SymbolAlignment) { + ErrHandler("cannot specify SymbolType or SymbolAlignment if " + "SymbolAlignmentAndType is specified"); + return false; + } + SymAlignAndType = *AuxSym.SymbolAlignmentAndType; + } else { + if (AuxSym.SymbolType) { + uint8_t SymbolType = *AuxSym.SymbolType; + if (SymbolType & ~XCOFFCsectAuxRef::SymbolTypeMask) { + ErrHandler("symbol type must be less than " + + Twine(1 + XCOFFCsectAuxRef::SymbolTypeMask)); + return false; + } + SymAlignAndType = SymbolType; + } + if (AuxSym.SymbolAlignment) { + const uint8_t ShiftedSymbolAlignmentMask = + XCOFFCsectAuxRef::SymbolAlignmentMask >> + XCOFFCsectAuxRef::SymbolAlignmentBitOffset; + + if (*AuxSym.SymbolAlignment & ~ShiftedSymbolAlignmentMask) { + ErrHandler("symbol alignment must be less than " + + Twine(1 + ShiftedSymbolAlignmentMask)); + return false; + } + SymAlignAndType |= (*AuxSym.SymbolAlignment + << XCOFFCsectAuxRef::SymbolAlignmentBitOffset); + } + } if (Is64Bit) { W.write(AuxSym.SectionOrLengthLo.value_or(0)); W.write(AuxSym.ParameterHashIndex.value_or(0)); W.write(AuxSym.TypeChkSectNum.value_or(0)); - W.write(AuxSym.SymbolAlignmentAndType.value_or(0)); + W.write(SymAlignAndType); W.write(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR)); W.write(AuxSym.SectionOrLengthHi.value_or(0)); W.write(0); @@ -538,23 +571,25 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { W.write(AuxSym.SectionOrLength.value_or(0)); W.write(AuxSym.ParameterHashIndex.value_or(0)); W.write(AuxSym.TypeChkSectNum.value_or(0)); - W.write(AuxSym.SymbolAlignmentAndType.value_or(0)); + W.write(SymAlignAndType); W.write(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR)); W.write(AuxSym.StabInfoIndex.value_or(0)); W.write(AuxSym.StabSectNum.value_or(0)); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) { assert(Is64Bit && "can't write the exception auxiliary symbol for XCOFF32"); W.write(AuxSym.OffsetToExceptionTbl.value_or(0)); W.write(AuxSym.SizeOfFunction.value_or(0)); W.write(AuxSym.SymIdxOfNextBeyond.value_or(0)); W.write(0); W.write(XCOFF::AUX_EXCEPT); + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { if (Is64Bit) { W.write(AuxSym.PtrToLineNum.value_or(0)); W.write(AuxSym.SizeOfFunction.value_or(0)); @@ -568,9 +603,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { W.write(AuxSym.SymIdxOfNextBeyond.value_or(0)); W.OS.write_zeros(2); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { StringRef FileName = AuxSym.FileNameOrString.value_or(""); if (nameShouldBeInStringTable(FileName)) { W.write(0); @@ -586,9 +622,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) { } else { W.OS.write_zeros(3); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { +bool 
XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { if (Is64Bit) { W.write(AuxSym.LineNum.value_or(0)); W.OS.write_zeros(13); @@ -599,9 +636,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { W.write(AuxSym.LineNumLo.value_or(0)); W.OS.write_zeros(12); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { if (Is64Bit) { W.write(AuxSym.LengthOfSectionPortion.value_or(0)); W.write(AuxSym.NumberOfRelocEnt.value_or(0)); @@ -613,34 +651,36 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { W.write(AuxSym.NumberOfRelocEnt.value_or(0)); W.OS.write_zeros(6); } + return true; } -void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) { +bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) { assert(!Is64Bit && "can't write the stat auxiliary symbol for XCOFF64"); W.write(AuxSym.SectionLength.value_or(0)); W.write(AuxSym.NumberOfRelocEnt.value_or(0)); W.write(AuxSym.NumberOfLineNum.value_or(0)); W.OS.write_zeros(10); + return true; } -void XCOFFWriter::writeAuxSymbol( +bool XCOFFWriter::writeAuxSymbol( const std::unique_ptr &AuxSym) { if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); + return writeAuxSymbol(*AS); else if (auto AS = dyn_cast(AuxSym.get())) - writeAuxSymbol(*AS); - else - llvm_unreachable("unknown auxiliary symbol type"); + return writeAuxSymbol(*AS); + llvm_unreachable("unknown auxiliary symbol type"); + return false; } bool XCOFFWriter::writeSymbols() { @@ -698,7 +738,8 @@ bool XCOFFWriter::writeSymbols() { } else { for (const std::unique_ptr &AuxSym : YamlSym.AuxEntries) { - writeAuxSymbol(AuxSym); + if (!writeAuxSymbol(AuxSym)) + return false; } // Pad with zeros. 
if (NumOfAuxSym > YamlSym.AuxEntries.size()) diff --git a/llvm/lib/ObjectYAML/XCOFFYAML.cpp b/llvm/lib/ObjectYAML/XCOFFYAML.cpp index 398b09c..83bf613 100644 --- a/llvm/lib/ObjectYAML/XCOFFYAML.cpp +++ b/llvm/lib/ObjectYAML/XCOFFYAML.cpp @@ -127,6 +127,17 @@ void ScalarEnumerationTraits::enumeration( #undef ECase } +void ScalarEnumerationTraits::enumeration( + IO &IO, XCOFF::SymbolType &Value) { +#define ECase(X) IO.enumCase(Value, #X, XCOFF::X) + ECase(XTY_ER); + ECase(XTY_SD); + ECase(XTY_LD); + ECase(XTY_CM); +#undef ECase + IO.enumFallback(Value); +} + void ScalarEnumerationTraits::enumeration( IO &IO, XCOFFYAML::AuxSymbolType &Type) { #define ECase(X) IO.enumCase(Type, #X, XCOFFYAML::X) @@ -229,6 +240,8 @@ static void auxSymMapping(IO &IO, XCOFFYAML::CsectAuxEnt &AuxSym, bool Is64) { IO.mapOptional("ParameterHashIndex", AuxSym.ParameterHashIndex); IO.mapOptional("TypeChkSectNum", AuxSym.TypeChkSectNum); IO.mapOptional("SymbolAlignmentAndType", AuxSym.SymbolAlignmentAndType); + IO.mapOptional("SymbolType", AuxSym.SymbolType); + IO.mapOptional("SymbolAlignment", AuxSym.SymbolAlignment); IO.mapOptional("StorageMappingClass", AuxSym.StorageMappingClass); if (Is64) { IO.mapOptional("SectionOrLengthLo", AuxSym.SectionOrLengthLo); @@ -350,7 +363,8 @@ void MappingTraits::mapping(IO &IO, XCOFFYAML::Symbol &S) { IO.mapOptional("AuxEntries", S.AuxEntries); } -void MappingTraits::mapping(IO &IO, XCOFFYAML::StringTable &Str) { +void MappingTraits::mapping( + IO &IO, XCOFFYAML::StringTable &Str) { IO.mapOptional("ContentSize", Str.ContentSize); IO.mapOptional("Length", Str.Length); IO.mapOptional("Strings", Str.Strings); diff --git a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml index fbd5fa0..9f2f68b 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml @@ -56,7 +56,8 @@ # CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 0 # CHECK32-NEXT: TypeChkSectNum: 0 -# CHECK32-NEXT: SymbolAlignmentAndType: 0 +# CHECK32-NEXT: SymbolType: XTY_ER +# CHECK32-NEXT: SymbolAlignment: 0 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 0 # CHECK32-NEXT: StabInfoIndex: 0 @@ -71,7 +72,8 @@ # CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 0 # CHECK32-NEXT: TypeChkSectNum: 0 -# CHECK32-NEXT: SymbolAlignmentAndType: 0 +# CHECK32-NEXT: SymbolType: XTY_ER +# CHECK32-NEXT: SymbolAlignment: 0 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 0 # CHECK32-NEXT: StabInfoIndex: 0 @@ -128,7 +130,8 @@ # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 0 # CHECK64-NEXT: TypeChkSectNum: 0 -# CHECK64-NEXT: SymbolAlignmentAndType: 0 +# CHECK64-NEXT: SymbolType: XTY_ER +# CHECK64-NEXT: SymbolAlignment: 0 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 0 # CHECK64-NEXT: SectionOrLengthHi: 0 @@ -142,7 +145,8 @@ # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 0 # CHECK64-NEXT: TypeChkSectNum: 0 -# CHECK64-NEXT: SymbolAlignmentAndType: 0 +# CHECK64-NEXT: SymbolType: XTY_ER +# CHECK64-NEXT: SymbolAlignment: 0 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 0 # CHECK64-NEXT: SectionOrLengthHi: 0 diff --git a/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml b/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml index 7f93b8d..8155ac1 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml @@ -34,7 +34,8 @@ # 
CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 1 # CHECK32-NEXT: TypeChkSectNum: 2 -# CHECK32-NEXT: SymbolAlignmentAndType: 41 +# CHECK32-NEXT: SymbolType: XTY_SD +# CHECK32-NEXT: SymbolAlignment: 5 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 3 # CHECK32-NEXT: StabInfoIndex: 4 @@ -54,7 +55,8 @@ # CHECK32-NEXT: - Type: AUX_CSECT # CHECK32-NEXT: ParameterHashIndex: 1 # CHECK32-NEXT: TypeChkSectNum: 2 -# CHECK32-NEXT: SymbolAlignmentAndType: 17 +# CHECK32-NEXT: SymbolType: XTY_SD +# CHECK32-NEXT: SymbolAlignment: 2 # CHECK32-NEXT: StorageMappingClass: XMC_PR # CHECK32-NEXT: SectionOrLength: 4 # CHECK32-NEXT: StabInfoIndex: 5 @@ -174,7 +176,8 @@ Symbols: # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 1 # CHECK64-NEXT: TypeChkSectNum: 2 -# CHECK64-NEXT: SymbolAlignmentAndType: 41 +# CHECK64-NEXT: SymbolType: XTY_SD +# CHECK64-NEXT: SymbolAlignment: 5 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 3 # CHECK64-NEXT: SectionOrLengthHi: 4 @@ -196,7 +199,8 @@ Symbols: # CHECK64-NEXT: - Type: AUX_CSECT # CHECK64-NEXT: ParameterHashIndex: 1 # CHECK64-NEXT: TypeChkSectNum: 2 -# CHECK64-NEXT: SymbolAlignmentAndType: 17 +# CHECK64-NEXT: SymbolType: XTY_SD +# CHECK64-NEXT: SymbolAlignment: 2 # CHECK64-NEXT: StorageMappingClass: XMC_PR # CHECK64-NEXT: SectionOrLengthLo: 3 # CHECK64-NEXT: SectionOrLengthHi: 4 diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml new file mode 100644 index 0000000..190224d --- /dev/null +++ b/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml @@ -0,0 +1,114 @@ +## Check that yaml2obj can parse SymbolAlignmentAndType, SymbolAlignment, +## and SymbolType. + +# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x01DF -o %t32 +# RUN: obj2yaml %t32 | FileCheck %s --check-prefix=CHECK +# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x01F7 -o %t64 +# RUN: obj2yaml %t64 | FileCheck %s --check-prefix=CHECK + +# CHECK: --- !XCOFF +# CHECK-NEXT: FileHeader: +# CHECK-NEXT: MagicNumber: +# CHECK: Symbols: +# CHECK: - Name: .fcn1 +# CHECK: NumberOfAuxEntries: 1 +# CHECK-NEXT: AuxEntries: +# CHECK-NEXT: - Type: AUX_CSECT +# CHECK: SymbolType: XTY_ER +# CHECK-NEXT: SymbolAlignment: 4 +# CHECK: - Name: .fcn2 +# CHECK: NumberOfAuxEntries: 1 +# CHECK-NEXT: AuxEntries: +# CHECK-NEXT: - Type: AUX_CSECT +# CHECK: SymbolType: XTY_SD +# CHECK-NEXT: SymbolAlignment: 2 +# CHECK: - Name: .fcn3 +# CHECK: NumberOfAuxEntries: 1 +# CHECK-NEXT: AuxEntries: +# CHECK-NEXT: - Type: AUX_CSECT +# CHECK: SymbolType: XTY_SD +# CHECK-NEXT: SymbolAlignment: 0 + +--- !XCOFF +FileHeader: + MagicNumber: [[MAGIC]] +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignment: 4 + - StorageClass: C_EXT + Name: .fcn2 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignment: 2 + SymbolType: XTY_SD + - StorageClass: C_EXT + Name: .fcn3 + AuxEntries: + - Type: AUX_CSECT + SymbolType: XTY_SD + +## Ensure that SymbolAlignment is in range. +# RUN: not yaml2obj %s --docnum=2 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR1 +# ERROR1: symbol alignment must be less than 32 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1F7 +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolType: XTY_SD + SymbolAlignment: 32 + SectionOrLengthLo: 4 + +## Ensure that neither SymbolAlignment nor SymbolType can be specified if +## SymbolAlignmentAndType is specified. 
+# RUN: not yaml2obj %s --docnum=3 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2 +# ERROR2: cannot specify SymbolType or SymbolAlignment if SymbolAlignmentAndType is specified + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 17 + SymbolAlignment: 4 + SectionOrLength: 4 + +# RUN: not yaml2obj %s --docnum=4 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF +Symbols: + - StorageClass: C_EXT + Name: .fcn1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 17 + SymbolAlignment: 4 + SymbolType: XTY_CM + SectionOrLength: 4 + +# RUN: not yaml2obj %s --docnum=5 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1F7 +Symbols: + - StorageClass: C_EXT + - StorageClass: C_EXT + Name: .fcn2 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 18 + SymbolType: XTY_SD + SectionOrLengthLo: 4 diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml index fe75c19..04c774d 100644 --- a/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml +++ b/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml @@ -579,3 +579,28 @@ Symbols: AuxEntries: - Type: AUX_FILE FileNameOrString: foo + +## Case10: Specify a SymbolType outside the range of field definition. +# RUN: not yaml2obj %s -DSYMTYPE=8 --docnum=8 -o %t10 2>&1 | \ +# RUN: FileCheck %s --check-prefix BADSYMTYPE + +# BADSYMTYPE: error: symbol type must be less than 8 + +## Case11: Specify a SymbolType outside the range of its enumeration. +# RUN: yaml2obj %s -DSYMTYPE=7 --docnum=8 -o %t11 +# RUN: llvm-readobj --syms %t11 | FileCheck %s --check-prefix=STYPE + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF +Symbols: + - Name: aux_fcn_csect + StorageClass: C_EXT + Type: 0x20 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignment: 4 + SymbolType: [[SYMTYPE=]] + +# STYPE: SymbolAlignmentLog2: 4 +# STYPE-NEXT: SymbolType: 0x7 diff --git a/llvm/tools/obj2yaml/xcoff2yaml.cpp b/llvm/tools/obj2yaml/xcoff2yaml.cpp index 0acbf48..e426b64 100644 --- a/llvm/tools/obj2yaml/xcoff2yaml.cpp +++ b/llvm/tools/obj2yaml/xcoff2yaml.cpp @@ -209,7 +209,9 @@ void XCOFFDumper::dumpCsectAuxSym(XCOFFYAML::Symbol &Sym, XCOFFYAML::CsectAuxEnt CsectAuxSym; CsectAuxSym.ParameterHashIndex = AuxEntPtr.getParameterHashIndex(); CsectAuxSym.TypeChkSectNum = AuxEntPtr.getTypeChkSectNum(); - CsectAuxSym.SymbolAlignmentAndType = AuxEntPtr.getSymbolAlignmentAndType(); + CsectAuxSym.SymbolAlignment = AuxEntPtr.getAlignmentLog2(); + CsectAuxSym.SymbolType = + static_cast<XCOFF::SymbolType>(AuxEntPtr.getSymbolType()); CsectAuxSym.StorageMappingClass = AuxEntPtr.getStorageMappingClass(); if (Obj.is64Bit()) { -- cgit v1.1 From 58e8147d1690485ed0a6fcb59c7b6ea4b8cd2936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Feb 2024 08:49:11 -0800 Subject: [flang][openacc] Use original input for base address with optional (#80931) In #80317 the data op generation was updated to correctly use the #0 result from the hlfir.declare op. In the case of optionals that are not descriptors, it is preferable to use the original input for the varPtr value of the OpenACC data op. This patch also makes sure that the descriptor value of an optional is only accessed when present.
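As a rough model of the guard being added (plain C++ standing in for the fir.if regions the lowering actually builds with FirOpBuilder; Descriptor, readByteStride, and readExtent are invented names for illustration), the descriptor is dereferenced only on the is-present path and a neutral zero is produced otherwise, matching the genBoundsOps changes below:

#include <cstdint>
#include <optional>

// Invented stand-in for a Fortran array descriptor; the real lowering
// queries fir.box_dims on a fir.box value.
struct Descriptor {
  std::int64_t ByteStride;
  std::int64_t Extent;
};

// Mirrors the generated IR's shape: fir.if %is_present { box_dims ... }
// else { 0 }. An absent optional's descriptor is never touched.
static std::int64_t readByteStride(const std::optional<Descriptor> &Box) {
  return Box ? Box->ByteStride : 0;
}

static std::int64_t readExtent(const std::optional<Descriptor> &Box) {
  return Box ? Box->Extent : 0;
}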
--- flang/lib/Lower/DirectivesCommon.h | 93 ++++++++++++++++++++++++++------- flang/lib/Lower/OpenACC.cpp | 20 +++++-- flang/test/Lower/OpenACC/acc-bounds.f90 | 38 +++++++++++++- 3 files changed, 124 insertions(+), 27 deletions(-) diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h index bd88037..8d560db 100644 --- a/flang/lib/Lower/DirectivesCommon.h +++ b/flang/lib/Lower/DirectivesCommon.h @@ -52,10 +52,13 @@ namespace lower { /// operations. struct AddrAndBoundsInfo { explicit AddrAndBoundsInfo() {} - explicit AddrAndBoundsInfo(mlir::Value addr) : addr(addr) {} - explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value isPresent) - : addr(addr), isPresent(isPresent) {} + explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value rawInput) + : addr(addr), rawInput(rawInput) {} + explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value rawInput, + mlir::Value isPresent) + : addr(addr), rawInput(rawInput), isPresent(isPresent) {} mlir::Value addr = nullptr; + mlir::Value rawInput = nullptr; mlir::Value isPresent = nullptr; }; @@ -615,20 +618,30 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder, Fortran::lower::SymbolRef sym, mlir::Location loc) { mlir::Value symAddr = converter.getSymbolAddress(sym); + mlir::Value rawInput = symAddr; if (auto declareOp = - mlir::dyn_cast_or_null(symAddr.getDefiningOp())) + mlir::dyn_cast_or_null(symAddr.getDefiningOp())) { symAddr = declareOp.getResults()[0]; + rawInput = declareOp.getResults()[1]; + } // TODO: Might need revisiting to handle for non-shared clauses if (!symAddr) { if (const auto *details = - sym->detailsIf()) + sym->detailsIf()) { symAddr = converter.getSymbolAddress(details->symbol()); + rawInput = symAddr; + } } if (!symAddr) llvm::report_fatal_error("could not retrieve symbol address"); + mlir::Value isPresent; + if (Fortran::semantics::IsOptional(sym)) + isPresent = + builder.create(loc, builder.getI1Type(), rawInput); + if (auto boxTy = fir::unwrapRefType(symAddr.getType()).dyn_cast()) { if (boxTy.getEleTy().isa()) @@ -638,8 +651,6 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, // `fir.ref>` type. 
if (symAddr.getType().isa()) { if (Fortran::semantics::IsOptional(sym)) { - mlir::Value isPresent = - builder.create(loc, builder.getI1Type(), symAddr); mlir::Value addr = builder.genIfOp(loc, {boxTy}, isPresent, /*withElseRegion=*/true) .genThen([&]() { @@ -652,14 +663,13 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, builder.create(loc, mlir::ValueRange{absent}); }) .getResults()[0]; - return AddrAndBoundsInfo(addr, isPresent); + return AddrAndBoundsInfo(addr, rawInput, isPresent); } mlir::Value addr = builder.create(loc, symAddr); - return AddrAndBoundsInfo(addr); - ; + return AddrAndBoundsInfo(addr, rawInput, isPresent); } } - return AddrAndBoundsInfo(symAddr); + return AddrAndBoundsInfo(symAddr, rawInput, isPresent); } template @@ -807,7 +817,7 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, Fortran::lower::StatementContext &stmtCtx, const std::list &subscripts, std::stringstream &asFortran, fir::ExtendedValue &dataExv, - bool dataExvIsAssumedSize, mlir::Value baseAddr, + bool dataExvIsAssumedSize, AddrAndBoundsInfo &info, bool treatIndexAsSection = false) { int dimension = 0; mlir::Type idxTy = builder.getIndexType(); @@ -831,11 +841,30 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value stride = one; bool strideInBytes = false; - if (fir::unwrapRefType(baseAddr.getType()).isa()) { - mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension); - auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, - baseAddr, d); - stride = dimInfo.getByteStride(); + if (fir::unwrapRefType(info.addr.getType()).isa()) { + if (info.isPresent) { + stride = + builder + .genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true) + .genThen([&]() { + mlir::Value d = + builder.createIntegerConstant(loc, idxTy, dimension); + auto dimInfo = builder.create( + loc, idxTy, idxTy, idxTy, info.addr, d); + builder.create(loc, dimInfo.getByteStride()); + }) + .genElse([&] { + mlir::Value zero = + builder.createIntegerConstant(loc, idxTy, 0); + builder.create(loc, zero); + }) + .getResults()[0]; + } else { + mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension); + auto dimInfo = builder.create(loc, idxTy, idxTy, + idxTy, info.addr, d); + stride = dimInfo.getByteStride(); + } strideInBytes = true; } @@ -919,7 +948,26 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, } } - extent = fir::factory::readExtent(builder, loc, dataExv, dimension); + if (info.isPresent && + fir::unwrapRefType(info.addr.getType()).isa()) { + extent = + builder + .genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true) + .genThen([&]() { + mlir::Value ext = fir::factory::readExtent( + builder, loc, dataExv, dimension); + builder.create(loc, ext); + }) + .genElse([&] { + mlir::Value zero = + builder.createIntegerConstant(loc, idxTy, 0); + builder.create(loc, zero); + }) + .getResults()[0]; + } else { + extent = fir::factory::readExtent(builder, loc, dataExv, dimension); + } + if (dataExvIsAssumedSize && dimension + 1 == dataExvRank) { extent = zero; if (ubound && lbound) { @@ -976,6 +1024,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( dataExv = converter.genExprAddr(operandLocation, *exprBase, stmtCtx); info.addr = fir::getBase(dataExv); + info.rawInput = info.addr; asFortran << (*exprBase).AsFortran(); } else { const Fortran::parser::Name &name = @@ -993,7 +1042,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( bounds = genBoundsOps( builder, operandLocation, converter, stmtCtx, arrayElement->subscripts, asFortran, 
dataExv, - dataExvIsAssumedSize, info.addr, treatIndexAsSection); + dataExvIsAssumedSize, info, treatIndexAsSection); } asFortran << ')'; } else if (auto structComp = Fortran::parser::Unwrap< @@ -1001,6 +1050,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( fir::ExtendedValue compExv = converter.genExprAddr(operandLocation, *expr, stmtCtx); info.addr = fir::getBase(compExv); + info.rawInput = info.addr; if (fir::unwrapRefType(info.addr.getType()) .isa()) bounds = genBaseBoundsOps( @@ -1012,7 +1062,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( *Fortran::parser::GetLastName(*structComp).symbol); if (isOptional) info.isPresent = builder.create( - operandLocation, builder.getI1Type(), info.addr); + operandLocation, builder.getI1Type(), info.rawInput); if (auto loadOp = mlir::dyn_cast_or_null( info.addr.getDefiningOp())) { @@ -1020,6 +1070,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( fir::isPointerType(loadOp.getType())) info.addr = builder.create(operandLocation, info.addr); + info.rawInput = info.addr; } // If the component is an allocatable or pointer the result of @@ -1029,6 +1080,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( if (auto boxAddrOp = mlir::dyn_cast_or_null( info.addr.getDefiningOp())) { info.addr = boxAddrOp.getVal(); + info.rawInput = info.addr; bounds = genBoundsOpsFromBox( builder, operandLocation, converter, compExv, info); } @@ -1043,6 +1095,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( fir::ExtendedValue compExv = converter.genExprAddr(operandLocation, *expr, stmtCtx); info.addr = fir::getBase(compExv); + info.rawInput = info.addr; asFortran << (*expr).AsFortran(); } else if (const auto *dataRef{ std::get_if( diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 43f54c6..6ae270f 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -67,9 +67,12 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value varPtrPtr; if (auto boxTy = baseAddr.getType().dyn_cast()) { if (isPresent) { + mlir::Type ifRetTy = boxTy.getEleTy(); + if (!fir::isa_ref_type(ifRetTy)) + ifRetTy = fir::ReferenceType::get(ifRetTy); baseAddr = builder - .genIfOp(loc, {boxTy.getEleTy()}, isPresent, + .genIfOp(loc, {ifRetTy}, isPresent, /*withElseRegion=*/true) .genThen([&]() { mlir::Value boxAddr = @@ -78,7 +81,7 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc, }) .genElse([&] { mlir::Value absent = - builder.create(loc, boxTy.getEleTy()); + builder.create(loc, ifRetTy); builder.create(loc, mlir::ValueRange{absent}); }) .getResults()[0]; @@ -295,9 +298,16 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, asFortran, bounds, /*treatIndexAsSection=*/true); - Op op = createDataEntryOp( - builder, operandLocation, info.addr, asFortran, bounds, structured, - implicit, dataClause, info.addr.getType(), info.isPresent); + // If the input value is optional and is not a descriptor, we use the + // rawInput directly. + mlir::Value baseAddr = + ((info.addr.getType() != fir::unwrapRefType(info.rawInput.getType())) && + info.isPresent) + ? 
info.rawInput + : info.addr; + Op op = createDataEntryOp(builder, operandLocation, baseAddr, asFortran, + bounds, structured, implicit, dataClause, + baseAddr.getType(), info.isPresent); dataOperands.push_back(op.getAccPtr()); } } diff --git a/flang/test/Lower/OpenACC/acc-bounds.f90 b/flang/test/Lower/OpenACC/acc-bounds.f90 index bd96bc8..df97cbc 100644 --- a/flang/test/Lower/OpenACC/acc-bounds.f90 +++ b/flang/test/Lower/OpenACC/acc-bounds.f90 @@ -126,8 +126,8 @@ contains ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "a", fir.optional}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) -! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#0 : (!fir.ref>>>) -> i1 +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#1 : (!fir.ref>>>) -> i1 ! CHECK: %[[BOX:.*]] = fir.if %[[IS_PRESENT]] -> (!fir.box>>) { ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref>>> ! CHECK: fir.result %[[LOAD]] : !fir.box>> @@ -153,4 +153,38 @@ contains ! CHECK: %[[ATTACH:.*]] = acc.attach varPtr(%[[BOX_ADDR]] : !fir.ptr>) bounds(%[[BOUND]]) -> !fir.ptr> {name = "a"} ! CHECK: acc.data dataOperands(%[[ATTACH]] : !fir.ptr>) + subroutine acc_optional_data2(a, n) + integer :: n + real, optional :: a(n) + !$acc data no_create(a) + !$acc end data + end subroutine + +! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data2( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref {fir.bindc_name = "n"}) { +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[NO_CREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref>) bounds(%10) -> !fir.ref> {name = "a"} +! CHECK: acc.data dataOperands(%[[NO_CREATE]] : !fir.ref>) { + + subroutine acc_optional_data3(a, n) + integer :: n + real, optional :: a(n) + !$acc data no_create(a(1:n)) + !$acc end data + end subroutine + +! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data3( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref {fir.bindc_name = "n"}) { +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[PRES:.*]] = fir.is_present %[[DECL_A]]#1 : (!fir.ref>) -> i1 +! CHECK: %[[STRIDE:.*]] = fir.if %[[PRES]] -> (index) { +! CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[DECL_A]]#0, %c0{{.*}} : (!fir.box>, index) -> (index, index, index) +! CHECK: fir.result %[[DIMS]]#2 : index +! CHECK: } else { +! CHECK: fir.result %c0{{.*}} : index +! CHECK: } +! CHECK: %[[BOUNDS:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%[[STRIDE]] : index) startIdx(%c1 : index) {strideInBytes = true} +! CHECK: %[[NOCREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref>) bounds(%14) -> !fir.ref> {name = "a(1:n)"} +! 
CHECK: acc.data dataOperands(%[[NOCREATE]] : !fir.ref>) {
+
 end module
-- 
cgit v1.1


From 66d462d0a1ba1e510fff479baff8f21ecb924b1f Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Thu, 8 Feb 2024 08:54:52 -0800
Subject: Add missing textual header to module map

---
 clang/include/module.modulemap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
index 794526b..9285595 100644
--- a/clang/include/module.modulemap
+++ b/clang/include/module.modulemap
@@ -81,6 +81,7 @@ module Clang_Basic {
   textual header "clang/Basic/RISCVVTypes.def"
   textual header "clang/Basic/Sanitizers.def"
   textual header "clang/Basic/TargetCXXABI.def"
+  textual header "clang/Basic/TargetOSMacros.def"
   textual header "clang/Basic/TransformTypeTraits.def"
   textual header "clang/Basic/TokenKinds.def"
   textual header "clang/Basic/WebAssemblyReferenceTypes.def"
-- 
cgit v1.1


From 750981f1a2c6069cded709b75cc87d7abd05277a Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Thu, 8 Feb 2024 09:03:47 -0800
Subject: Fix a truly strange triple in testcase

---
 lldb/test/API/macosx/universal/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/macosx/universal/Makefile b/lldb/test/API/macosx/universal/Makefile
index 8712fde..7d4762f 100644
--- a/lldb/test/API/macosx/universal/Makefile
+++ b/lldb/test/API/macosx/universal/Makefile
@@ -14,7 +14,7 @@ testit.x86_64: testit.x86_64.o
 	$(CC) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o testit.x86_64 $<

 testit.x86_64h.o: main.c
-	$(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64h-apple-macosx10.9-apple-macosx10.9-apple-macosx10.9-apple-macosx10.9 -c -o testit.x86_64h.o $<
+	$(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64h-apple-macosx10.9 -c -o testit.x86_64h.o $<

 testit.x86_64.o: main.c
 	$(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64-apple-macosx10.9 -c -o testit.x86_64.o $<
-- 
cgit v1.1


From bdde5f9bea75e897bcc31a95b9c3376988c211cc Mon Sep 17 00:00:00 2001
From: Jeremy Morse
Date: Thu, 8 Feb 2024 16:40:48 +0000
Subject: [DebugInfo][RemoveDIs] Turn on non-intrinsic debug-info by default

This patch causes all variable-location debug-info to be converted into
non-intrinsic records as it passes through the optimisation /
instrumentation passes. There's a brief introduction here [0] and a more
detailed thread on what this means on discourse at [1].

If this commit is breaking your downstream tests, please see comment 12
in [1], which documents the kind of variation in tests we'd expect to
see from this change and what to do about it.
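Downstream projects that need time to migrate can restore the old
intrinsic-based format per invocation via the flag this patch flips; a
hedged usage sketch (the flag name comes from the diff below, the rest of
the invocation is illustrative):

  opt -experimental-debuginfo-iterators=false -passes=instcombine in.ll -S -o out.ll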
[0] https://llvm.org/docs/RemoveDIsDebugInfo.html [1] https://discourse.llvm.org/t/rfc-instruction-api-changes-needed-to-eliminate-debug-intrinsics-from-ir/68939 --- llvm/lib/IR/BasicBlock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index fe9d0d0..bf02eba 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -34,7 +34,7 @@ cl::opt UseNewDbgInfoFormat("experimental-debuginfo-iterators", cl::desc("Enable communicating debuginfo positions " "through iterators, eliminating intrinsics"), - cl::init(false)); + cl::init(true)); DPMarker *BasicBlock::createMarker(Instruction *I) { assert(IsNewDbgInfoFormat && -- cgit v1.1 From f219cda7bd43696792ca4668ca5a9fbf55a9f09f Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 8 Feb 2024 09:16:12 -0800 Subject: [lldb] Fix printf formatting of std::time_t seconds (#81078) This formatter https://github.com/llvm/llvm-project/pull/78609 was originally passing the signed seconds (which can refer to times in the past) with an unsigned printf formatter, and had tests that expected to see negative values from the printf which always failed on macOS. I'm not clear how they ever passed on any platform. Fix the printf to print seconds as a signed value, and re-enable the tests. --- lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp | 6 ++--- .../libcxx/chrono/TestDataFormatterLibcxxChrono.py | 30 ++++++++++------------ 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index a7d7066..7893aa7 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -1108,7 +1108,7 @@ bool lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider( const std::time_t seconds = ptr_sp->GetValueAsSigned(0); if (seconds < chrono_timestamp_min || seconds > chrono_timestamp_max) - stream.Printf("timestamp=%" PRIu64 " s", static_cast(seconds)); + stream.Printf("timestamp=%" PRId64 " s", static_cast(seconds)); else { std::array str; std::size_t size = @@ -1116,8 +1116,8 @@ bool lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider( if (size == 0) return false; - stream.Printf("date/time=%s timestamp=%" PRIu64 " s", str.data(), - static_cast(seconds)); + stream.Printf("date/time=%s timestamp=%" PRId64 " s", str.data(), + static_cast(seconds)); } return true; diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py index 9706f9e..a90fb82 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py @@ -54,17 +54,16 @@ class LibcxxChronoDataFormatterTestCase(TestBase): substrs=["ss_0 = date/time=1970-01-01T00:00:00Z timestamp=0 s"], ) - # FIXME disabled temporarily, macOS is printing this as an unsigned? 
- #self.expect( - # "frame variable ss_neg_date_time", - # substrs=[ - # "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s" - # ], - #) - #self.expect( - # "frame variable ss_neg_seconds", - # substrs=["ss_neg_seconds = timestamp=-1096193779201 s"], - #) + self.expect( + "frame variable ss_neg_date_time", + substrs=[ + "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s" + ], + ) + self.expect( + "frame variable ss_neg_seconds", + substrs=["ss_neg_seconds = timestamp=-1096193779201 s"], + ) self.expect( "frame variable ss_pos_date_time", @@ -77,11 +76,10 @@ class LibcxxChronoDataFormatterTestCase(TestBase): substrs=["ss_pos_seconds = timestamp=971890963200 s"], ) - # FIXME disabled temporarily, macOS is printing this as an unsigned? - #self.expect( - # "frame variable ss_min", - # substrs=["ss_min = timestamp=-9223372036854775808 s"], - #) + self.expect( + "frame variable ss_min", + substrs=["ss_min = timestamp=-9223372036854775808 s"], + ) self.expect( "frame variable ss_max", substrs=["ss_max = timestamp=9223372036854775807 s"], -- cgit v1.1 From af97edff70b0d9cb89729dc0d8af1d1ea101686e Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Thu, 8 Feb 2024 09:32:12 -0800 Subject: [lldb] Refactor GetFormatFromCString to always check for partial matches (NFC) (#81018) Refactors logic in `ParseInternal` that was previously calling `GetFormatFromCString` twice, once with `partial_match_ok` set to false, and the second time set to true. With this change, lldb formats (ie `%@`, `%S`, etc) are checked first. If a format is not one of those, then `GetFormatFromCString` is called once, and now always checks for partial matches. --- lldb/include/lldb/DataFormatters/FormatManager.h | 2 +- lldb/source/Core/FormatEntity.cpp | 26 ++++++++++-------------- lldb/source/DataFormatters/FormatManager.cpp | 17 +++++++--------- lldb/source/Interpreter/OptionArgParser.cpp | 3 +-- 4 files changed, 20 insertions(+), 28 deletions(-) diff --git a/lldb/include/lldb/DataFormatters/FormatManager.h b/lldb/include/lldb/DataFormatters/FormatManager.h index 986614f..db2fe99 100644 --- a/lldb/include/lldb/DataFormatters/FormatManager.h +++ b/lldb/include/lldb/DataFormatters/FormatManager.h @@ -138,7 +138,7 @@ public: } static bool GetFormatFromCString(const char *format_cstr, - bool partial_match_ok, lldb::Format &format); + lldb::Format &format); static char GetFormatAsFormatChar(lldb::Format format); diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp index 3c665c2..fa5eadc 100644 --- a/lldb/source/Core/FormatEntity.cpp +++ b/lldb/source/Core/FormatEntity.cpp @@ -2151,11 +2151,7 @@ static Status ParseInternal(llvm::StringRef &format, Entry &parent_entry, if (entry.printf_format.find('%') == std::string::npos) { bool clear_printf = false; - if (FormatManager::GetFormatFromCString( - entry.printf_format.c_str(), false, entry.fmt)) { - // We have an LLDB format, so clear the printf format - clear_printf = true; - } else if (entry.printf_format.size() == 1) { + if (entry.printf_format.size() == 1) { switch (entry.printf_format[0]) { case '@': // if this is an @ sign, print ObjC description entry.number = ValueObject:: @@ -2198,20 +2194,20 @@ static Status ParseInternal(llvm::StringRef &format, Entry &parent_entry, eValueObjectRepresentationStyleExpressionPath; clear_printf = true; break; - default: + } + } + + if (entry.number == 0) { + if (FormatManager::GetFormatFromCString( + entry.printf_format.c_str(), entry.fmt)) { + clear_printf = true; + 
} else if (entry.printf_format == "tid") { + verify_is_thread_id = true; + } else { error.SetErrorStringWithFormat("invalid format: '%s'", entry.printf_format.c_str()); return error; } - } else if (FormatManager::GetFormatFromCString( - entry.printf_format.c_str(), true, entry.fmt)) { - clear_printf = true; - } else if (entry.printf_format == "tid") { - verify_is_thread_id = true; - } else { - error.SetErrorStringWithFormat("invalid format: '%s'", - entry.printf_format.c_str()); - return error; } // Our format string turned out to not be a printf style format diff --git a/lldb/source/DataFormatters/FormatManager.cpp b/lldb/source/DataFormatters/FormatManager.cpp index f1f135d..092fa3c 100644 --- a/lldb/source/DataFormatters/FormatManager.cpp +++ b/lldb/source/DataFormatters/FormatManager.cpp @@ -91,7 +91,7 @@ static bool GetFormatFromFormatChar(char format_char, Format &format) { } static bool GetFormatFromFormatName(llvm::StringRef format_name, - bool partial_match_ok, Format &format) { + Format &format) { uint32_t i; for (i = 0; i < g_num_format_infos; ++i) { if (format_name.equals_insensitive(g_format_infos[i].format_name)) { @@ -100,13 +100,11 @@ static bool GetFormatFromFormatName(llvm::StringRef format_name, } } - if (partial_match_ok) { - for (i = 0; i < g_num_format_infos; ++i) { - if (llvm::StringRef(g_format_infos[i].format_name) - .starts_with_insensitive(format_name)) { - format = g_format_infos[i].format; - return true; - } + for (i = 0; i < g_num_format_infos; ++i) { + if (llvm::StringRef(g_format_infos[i].format_name) + .starts_with_insensitive(format_name)) { + format = g_format_infos[i].format; + return true; } } format = eFormatInvalid; @@ -124,7 +122,6 @@ void FormatManager::Changed() { } bool FormatManager::GetFormatFromCString(const char *format_cstr, - bool partial_match_ok, lldb::Format &format) { bool success = false; if (format_cstr && format_cstr[0]) { @@ -134,7 +131,7 @@ bool FormatManager::GetFormatFromCString(const char *format_cstr, return true; } - success = GetFormatFromFormatName(format_cstr, partial_match_ok, format); + success = GetFormatFromFormatName(format_cstr, format); } if (!success) format = eFormatInvalid; diff --git a/lldb/source/Interpreter/OptionArgParser.cpp b/lldb/source/Interpreter/OptionArgParser.cpp index d13805a..75ccad8 100644 --- a/lldb/source/Interpreter/OptionArgParser.cpp +++ b/lldb/source/Interpreter/OptionArgParser.cpp @@ -93,8 +93,7 @@ Status OptionArgParser::ToFormat(const char *s, lldb::Format &format, *byte_size_ptr = 0; } - const bool partial_match_ok = true; - if (!FormatManager::GetFormatFromCString(s, partial_match_ok, format)) { + if (!FormatManager::GetFormatFromCString(s, format)) { StreamString error_strm; error_strm.Printf( "Invalid format character or name '%s'. Valid values are:\n", s); -- cgit v1.1 From bef25ae297d6d246bf0fa8667c8b08f9d5e8dae7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 17:31:06 +0000 Subject: [X86] X86FixupVectorConstants - use explicit register bitwidth for the loaded vector instead of using constant pool bitwidth Fixes #81136 - we might be loading from a constant pool entry wider than the destination register bitwidth, affecting the vextload scale calculation. 
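To make the scale problem concrete, here is a hand-worked sketch (a
hedged illustration in C++; the names are not the pass's actual
variables). The extend-load fixups rebuild the constant with a
per-element width derived from the register width divided by the element
count, so substituting the wider pool-entry width missizes every element:

  // Assumed numbers mirroring the PR scenario: a 128-bit destination
  // register whose constant-pool entry is 256 bits wide.
  unsigned NumElts = 2;
  unsigned WrongEltBits = 256 / NumElts; // pool width -> 128-bit elements
  unsigned RightEltBits = 128 / NumElts; // reg width  ->  64-bit elements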
ConvertToBroadcastAVX512 doesn't yet set an explicit bitwidth (it will default to the constant pool bitwidth) due to difficulties in looking up the original register width through the fold tables, but as we only use rebuildSplatCst this shouldn't cause any miscompilations, although it might prevent folding to broadcast if only the lower bits match a splatable pattern. --- llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 35 ++++++++++++++----------- llvm/test/CodeGen/X86/pr81136.ll | 3 +-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 32ca9c16..da7dcbb 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -226,6 +226,7 @@ static Constant *rebuildConstant(LLVMContext &Ctx, Type *SclTy, // width, built up of potentially smaller scalar values. static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumBits*/, unsigned /*NumElts*/, unsigned SplatBitWidth) { + // TODO: Truncate to NumBits once ConvertToBroadcastAVX512 support this. std::optional Splat = getSplatableConstant(C, SplatBitWidth); if (!Splat) return nullptr; @@ -328,7 +329,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, std::function RebuildConstant; }; - auto FixupConstant = [&](ArrayRef Fixups, unsigned OperandNo) { + auto FixupConstant = [&](ArrayRef Fixups, unsigned RegBitWidth, + unsigned OperandNo) { #ifdef EXPENSIVE_CHECKS assert(llvm::is_sorted(Fixups, [](const FixupEntry &A, const FixupEntry &B) { @@ -340,7 +342,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && "Unexpected number of operands!"); if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { - unsigned RegBitWidth = C->getType()->getPrimitiveSizeInBits(); + RegBitWidth = + RegBitWidth ? 
RegBitWidth : C->getType()->getPrimitiveSizeInBits(); for (const FixupEntry &Fixup : Fixups) { if (Fixup.Op) { // Construct a suitable constant and adjust the MI to use the new @@ -377,7 +380,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, // TODO: SSE3 MOVDDUP Handling return FixupConstant({{X86::MOVSSrm, 1, 32, rebuildZeroUpperCst}, {X86::MOVSDrm, 1, 64, rebuildZeroUpperCst}}, - 1); + 128, 1); case X86::VMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPDrm: @@ -386,7 +389,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst}, {X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst}, {X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}}, - 1); + 128, 1); case X86::VMOVAPDYrm: case X86::VMOVAPSYrm: case X86::VMOVUPDYrm: @@ -394,7 +397,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, return FixupConstant({{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst}, {X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst}, {X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}}, - 1); + 256, 1); case X86::VMOVAPDZ128rm: case X86::VMOVAPSZ128rm: case X86::VMOVUPDZ128rm: @@ -403,7 +406,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst}, {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst}, {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}}, - 1); + 128, 1); case X86::VMOVAPDZ256rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPDZ256rm: @@ -412,7 +415,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst}, {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst}, {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}}, - 1); + 256, 1); case X86::VMOVAPDZrm: case X86::VMOVAPSZrm: case X86::VMOVUPDZrm: @@ -421,7 +424,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst}, {X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst}, {X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}}, - 1); + 512, 1); /* Integer Loads */ case X86::MOVDQArm: case X86::MOVDQUrm: { @@ -440,7 +443,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {HasSSE41 ? X86::PMOVZXWDrm : 0, 4, 16, rebuildZExtCst}, {HasSSE41 ? X86::PMOVSXDQrm : 0, 2, 32, rebuildSExtCst}, {HasSSE41 ? X86::PMOVZXDQrm : 0, 2, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 128, 1); } case X86::VMOVDQArm: case X86::VMOVDQUrm: { @@ -465,7 +468,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDrm, 4, 16, rebuildZExtCst}, {X86::VPMOVSXDQrm, 2, 32, rebuildSExtCst}, {X86::VPMOVZXDQrm, 2, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 128, 1); } case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: { @@ -490,7 +493,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {HasAVX2 ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst}, {HasAVX2 ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst}, {HasAVX2 ? 
X86::VPMOVZXDQYrm : 0, 4, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 256, 1); } case X86::VMOVDQA32Z128rm: case X86::VMOVDQA64Z128rm: @@ -515,7 +518,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDZ128rm, 4, 16, rebuildZExtCst}, {X86::VPMOVSXDQZ128rm, 2, 32, rebuildSExtCst}, {X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 128, 1); } case X86::VMOVDQA32Z256rm: case X86::VMOVDQA64Z256rm: @@ -539,7 +542,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDZ256rm, 8, 16, rebuildZExtCst}, {X86::VPMOVSXDQZ256rm, 4, 32, rebuildSExtCst}, {X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 256, 1); } case X86::VMOVDQA32Zrm: case X86::VMOVDQA64Zrm: @@ -564,7 +567,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXWDZrm, 16, 16, rebuildZExtCst}, {X86::VPMOVSXDQZrm, 8, 32, rebuildSExtCst}, {X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}}; - return FixupConstant(Fixups, 1); + return FixupConstant(Fixups, 512, 1); } } @@ -592,7 +595,9 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32; FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst}, {(int)OpBcst64, 64, 64, rebuildSplatCst}}; - return FixupConstant(Fixups, OpNo); + // TODO: Add support for RegBitWidth, but currently rebuildSplatCst + // doesn't require it (defaults to Constant::getPrimitiveSizeInBits). + return FixupConstant(Fixups, 0, OpNo); } return false; }; diff --git a/llvm/test/CodeGen/X86/pr81136.ll b/llvm/test/CodeGen/X86/pr81136.ll index 8843adc..b4ac3fc 100644 --- a/llvm/test/CodeGen/X86/pr81136.ll +++ b/llvm/test/CodeGen/X86/pr81136.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s -; FIXME: Should be vpmovzxbq[128,1] instead of vpmovzxbd[128,1,0,0] define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) { ; CHECK-LABEL: PR81136: ; CHECK: # %bb.0: @@ -9,7 +8,7 @@ define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) { ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vmovdqa (%rdx), %ymm2 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = [128,1,0,0] +; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm4 = [128,1] ; CHECK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 ; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -- cgit v1.1 From c8d431e0ed6ab6276bf45d1c36466faad8e4e4d1 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 8 Feb 2024 09:40:11 -0800 Subject: [riscv] Add test coverage in advance of a upcoming fix This is a reduced test case for a fix for the issue identified in https://github.com/llvm/llvm-project/issues/80910. 
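The ordering bug the upcoming fix targets can be reproduced in plain C++;
a hedged illustration (values chosen to expose the difference, not taken
from the patch):

  #include <cstdint>
  // Correct lowering: shift the full 64-bit value, then truncate.
  int32_t shiftThenTrunc(int64_t a) { return (int32_t)(a >> 1); }
  // The miscompiled order: truncate first, then shift.
  int32_t truncThenShift(int64_t a) { return (int32_t)a >> 1; }
  // For a = 0x100000000 (UINT32_MAX + 1):
  //   shiftThenTrunc(a) == INT32_MIN, but truncThenShift(a) == 0.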
---
 .../RISCV/rvv/fixed-vectors-buildvec-of-binop.ll | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index c8531ed..e376688 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -588,3 +588,37 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
   %v3 = insertelement <8 x i32> %v2, i32 %e3, i32 7
   ret <8 x i32> %v3
 }
+
+; FIXME: This is currently showing a miscompile, we effectively
+; truncate before the ashr instead of after it, so if %a or %b
+; is e.g. UINT32_MAX+1 we get different result.
+define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) {
+; RV32-LABEL: build_vec_of_trunc_op:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srli a0, a0, 1
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    slli a3, a3, 31
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: build_vec_of_trunc_op:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 1
+; RV64-NEXT:    ret
+entry:
+  %conv11.i = ashr i64 %a, 1
+  %conv11.2 = ashr i64 %b, 1
+  %0 = trunc i64 %conv11.i to i32
+  %1 = trunc i64 %conv11.2 to i32
+  %2 = insertelement <2 x i32> zeroinitializer, i32 %0, i64 0
+  %3 = insertelement <2 x i32> %2, i32 %1, i64 1
+  ret <2 x i32> %3
+}
-- 
cgit v1.1


From 16d1a6486c25769d264a6ddb70a48bbb1c23c077 Mon Sep 17 00:00:00 2001
From: Cooper Partin
Date: Thu, 8 Feb 2024 09:50:21 -0800
Subject: [DirectX] Fix HLSL bitshifts to leverage the OpenCL pipeline for bitshifting (#81030)

Fixes #55106

In HLSL, bit shifts are defined to shift by the shift amount modulo the
bit width of the value's type. This contains the following changes: HLSL
codegen emits bit shifts as x << (y & (sizeof(x)*8 - 1)), and the
bit-shift masking leverages the existing OpenCL pipeline for this.

Tests were also added to validate this behavior.
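A hedged restatement of the masked semantics in plain C++ (it mirrors the
shl32 test added below; this is not compiler output):

  // The shift count is masked to the type's bit width minus one.
  int shl32(int V, int S) { return V << (S & 31); }
  // e.g. shl32(1, 33) == 2, because 33 & 31 == 1; previously an
  // out-of-range count was undefined behavior.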
Before this change the following was being emitted:

  ; Function Attrs: noinline nounwind optnone
  define noundef i32 @"?shl32@@YAHHH@Z"(i32 noundef %V, i32 noundef %S) #0 {
  entry:
    %S.addr = alloca i32, align 4
    %V.addr = alloca i32, align 4
    store i32 %S, ptr %S.addr, align 4
    store i32 %V, ptr %V.addr, align 4
    %0 = load i32, ptr %V.addr, align 4
    %1 = load i32, ptr %S.addr, align 4
    %shl = shl i32 %0, %1
    ret i32 %shl
  }

After this change:

  ; Function Attrs: noinline nounwind optnone
  define noundef i32 @"?shl32@@YAHHH@Z"(i32 noundef %V, i32 noundef %S) #0 {
  entry:
    %S.addr = alloca i32, align 4
    %V.addr = alloca i32, align 4
    store i32 %S, ptr %S.addr, align 4
    store i32 %V, ptr %V.addr, align 4
    %0 = load i32, ptr %V.addr, align 4
    %1 = load i32, ptr %S.addr, align 4
    %shl.mask = and i32 %1, 31
    %shl = shl i32 %0, %shl.mask
    ret i32 %shl
  }

---------

Co-authored-by: Cooper Partin
---
 clang/lib/CodeGen/CGExprScalar.cpp     |  4 ++--
 clang/test/CodeGenHLSL/shift-mask.hlsl | 35 ++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/shift-mask.hlsl

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index df8f71c..fa03163 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -4168,7 +4168,7 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) {
   bool SanitizeBase = SanitizeSignedBase || SanitizeUnsignedBase;
   bool SanitizeExponent = CGF.SanOpts.has(SanitizerKind::ShiftExponent);
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
-  if (CGF.getLangOpts().OpenCL)
+  if (CGF.getLangOpts().OpenCL || CGF.getLangOpts().HLSL)
     RHS = ConstrainShiftValue(Ops.LHS, RHS, "shl.mask");
   else if ((SanitizeBase || SanitizeExponent) &&
            isa(Ops.LHS->getType())) {
@@ -4237,7 +4237,7 @@ Value *ScalarExprEmitter::EmitShr(const BinOpInfo &Ops) {
     RHS = Builder.CreateIntCast(RHS, Ops.LHS->getType(), false, "sh_prom");

   // OpenCL 6.3j: shift values are effectively % word size of LHS.
- if (CGF.getLangOpts().OpenCL) + if (CGF.getLangOpts().OpenCL || CGF.getLangOpts().HLSL) RHS = ConstrainShiftValue(Ops.LHS, RHS, "shr.mask"); else if (CGF.SanOpts.has(SanitizerKind::ShiftExponent) && isa(Ops.LHS->getType())) { diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl new file mode 100644 index 0000000..d046efa --- /dev/null +++ b/clang/test/CodeGenHLSL/shift-mask.hlsl @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +int shl32(int V, int S) { + return V << S; +} + +// CHECK: define noundef i32 @"?shl32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] + +int shr32(int V, int S) { + return V >> S; +} + +// CHECK: define noundef i32 @"?shr32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = ashr i32 %{{.*}}, %[[Masked]] + +int64_t shl64(int64_t V, int64_t S) { + return V << S; +} + +// CHECK: define noundef i64 @"?shl64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] + +int64_t shr64(int64_t V, int64_t S) { + return V >> S; +} + +// CHECK: define noundef i64 @"?shr64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = ashr i64 %{{.*}}, %[[Masked]] -- cgit v1.1 From 758fd59d018fe01262dd246e3e1e3d4389cb82e4 Mon Sep 17 00:00:00 2001 From: "S. Bharadwaj Yadavalli" Date: Thu, 8 Feb 2024 13:02:32 -0500 Subject: [DirectX][NFC] Change usage pattern *Dxil* to *DXIL* for uniformity (#80778) Match DXIL TableGen class names with structure names in DXIL Emitter. Delete unnecessary Name field. --- llvm/lib/Target/DirectX/DXIL.td | 89 +++++++++++----------- llvm/lib/Target/DirectX/DXILMetadata.cpp | 8 +- llvm/utils/TableGen/DXILEmitter.cpp | 125 +++++++++++++++---------------- 3 files changed, 107 insertions(+), 115 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index aec6460..3f3ace5 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -14,28 +14,28 @@ include "llvm/IR/Intrinsics.td" // Abstract representation of the class a DXIL Operation belongs to. 
-class DxilOpClass { +class DXILOpClass { string Name = name; } // Abstract representation of the category a DXIL Operation belongs to -class DxilOpCategory { +class DXILOpCategory { string Name = name; } -def UnaryClass : DxilOpClass<"Unary">; -def BinaryClass : DxilOpClass<"Binary">; -def FlattenedThreadIdInGroupClass : DxilOpClass<"FlattenedThreadIdInGroup">; -def ThreadIdInGroupClass : DxilOpClass<"ThreadIdInGroup">; -def ThreadIdClass : DxilOpClass<"ThreadId">; -def GroupIdClass : DxilOpClass<"GroupId">; +def UnaryClass : DXILOpClass<"Unary">; +def BinaryClass : DXILOpClass<"Binary">; +def FlattenedThreadIdInGroupClass : DXILOpClass<"FlattenedThreadIdInGroup">; +def ThreadIdInGroupClass : DXILOpClass<"ThreadIdInGroup">; +def ThreadIdClass : DXILOpClass<"ThreadId">; +def GroupIdClass : DXILOpClass<"GroupId">; -def BinaryUintCategory : DxilOpCategory<"Binary uint">; -def UnaryFloatCategory : DxilOpCategory<"Unary float">; -def ComputeIDCategory : DxilOpCategory<"Compute/Mesh/Amplification shader">; +def BinaryUintCategory : DXILOpCategory<"Binary uint">; +def UnaryFloatCategory : DXILOpCategory<"Unary float">; +def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">; // The parameter description for a DXIL operation -class DxilOpParameter { int Pos = pos; // Position in parameter list @@ -49,16 +49,13 @@ class DxilOpParameter { - // TODO : Appears redundant. OpName should serve the same purpose - string Name = name; // short, unique name - +class DXILOperationDesc { string OpName = ""; // Name of DXIL operation int OpCode = 0; // Unique non-negative integer associated with the operation - DxilOpClass OpClass; // Class of the operation - DxilOpCategory OpCategory; // Category of the operation + DXILOpClass OpClass; // Class of the operation + DXILOpCategory OpCategory; // Category of the operation string Doc = ""; // Description of the operation - list Params = []; // Parameter list of the operation + list Params = []; // Parameter list of the operation string OverloadTypes = ""; // Overload types, if applicable string Attributes = ""; // Attribute shorthands: rn=does not access // memory,ro=only reads from memory, @@ -73,9 +70,9 @@ class DxilOperationDesc { list StatsGroup = []; } -class DxilOperation params, - list statsGroup = []> : DxilOperationDesc { +class DXILOperation params, + list statsGroup = []> : DXILOperationDesc { let OpName = name; let OpCode = opCode; let Doc = doc; @@ -90,56 +87,56 @@ class DxilOperation { Intrinsic llvm_intrinsic = llvm_intrinsic_; } -def Sin : DxilOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", +def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", "half;float;", "rn", [ - DxilOpParameter<0, "$o", "", "operation result">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "$o", "value", "input value"> + DXILOpParameter<0, "$o", "", "operation result">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "$o", "value", "input value"> ], ["floats"]>, LLVMIntrinsic; -def UMax : DxilOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b", +def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? 
a : b", "i16;i32;i64;", "rn", [ - DxilOpParameter<0, "$o", "", "operation result">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "$o", "a", "input value">, - DxilOpParameter<3, "$o", "b", "input value"> + DXILOpParameter<0, "$o", "", "operation result">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "$o", "a", "input value">, + DXILOpParameter<3, "$o", "b", "input value"> ], ["uints"]>, LLVMIntrinsic; -def ThreadId : DxilOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", "i32;", "rn", +def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "thread ID component">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, "i32", "", "thread ID component">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def GroupId : DxilOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", "i32;", "rn", +def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "group ID component">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "i32", "component", "component to read"> + DXILOpParameter<0, "i32", "", "group ID component">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "i32", "component", "component to read"> ]>, LLVMIntrinsic; -def ThreadIdInGroup : DxilOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, +def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, "reads the thread ID within the group (SV_GroupThreadID)", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "thread ID in group component">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode">, - DxilOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, "i32", "", "thread ID in group component">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, + DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def FlattenedThreadIdInGroup : DxilOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, +def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, "provides a flattened index for a given thread within a given group (SV_GroupIndex)", "i32;", "rn", [ - DxilOpParameter<0, "i32", "", "result">, - DxilOpParameter<1, "i32", "opcode", "DXIL opcode"> + DXILOpParameter<0, "i32", "", "result">, + DXILOpParameter<1, "i32", "opcode", "DXIL opcode"> ]>, LLVMIntrinsic; diff --git a/llvm/lib/Target/DirectX/DXILMetadata.cpp b/llvm/lib/Target/DirectX/DXILMetadata.cpp index db55f25..2d94490 100644 --- a/llvm/lib/Target/DirectX/DXILMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILMetadata.cpp @@ -213,7 +213,7 @@ public: // FIXME: add signature for profile other than CS. // See https://github.com/llvm/llvm-project/issues/57928. 
MDTuple *Signatures = nullptr; - return emitDxilEntryPointTuple( + return emitDXILEntryPointTuple( &F, F.getName().str(), Signatures, Resources, Props.emitDXILEntryProps(RawShaderFlag, Ctx, /*IsLib*/ false), Ctx); } @@ -222,7 +222,7 @@ public: // FIXME: add signature for profile other than CS. // See https://github.com/llvm/llvm-project/issues/57928. MDTuple *Signatures = nullptr; - return emitDxilEntryPointTuple( + return emitDXILEntryPointTuple( &F, F.getName().str(), Signatures, /*entry in lib doesn't need resources metadata*/ nullptr, Props.emitDXILEntryProps(RawShaderFlag, Ctx, /*IsLib*/ true), Ctx); @@ -233,13 +233,13 @@ public: static MDTuple *emitEmptyEntryForLib(MDTuple *Resources, uint64_t RawShaderFlag, LLVMContext &Ctx) { - return emitDxilEntryPointTuple( + return emitDXILEntryPointTuple( nullptr, "", nullptr, Resources, EntryProps::emitEntryPropsForEmptyEntry(RawShaderFlag, Ctx), Ctx); } private: - static MDTuple *emitDxilEntryPointTuple(Function *Fn, const std::string &Name, + static MDTuple *emitDXILEntryPointTuple(Function *Fn, const std::string &Name, MDTuple *Signatures, MDTuple *Resources, MDTuple *Properties, diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 475a57a..cb9f9c6 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -30,7 +30,7 @@ struct DXILShaderModel { int Minor = 0; }; -struct DXILParam { +struct DXILParameter { int Pos; // position in parameter list ParameterKind Kind; StringRef Name; // short, unique name @@ -38,23 +38,21 @@ struct DXILParam { bool IsConst; // whether this argument requires a constant value in the IR StringRef EnumName; // the name of the enum type if applicable int MaxValue; // the maximum value for this parameter if applicable - DXILParam(const Record *R); + DXILParameter(const Record *R); }; -struct DXILOperationData { - StringRef Name; // short, unique name - - StringRef DXILOp; // name of DXIL operation - int DXILOpID; // ID of DXIL operation - StringRef DXILClass; // name of the opcode class +struct DXILOperationDesc { + StringRef OpName; // name of DXIL operation + int OpCode; // ID of DXIL operation + StringRef OpClass; // name of the opcode class StringRef Category; // classification for this instruction StringRef Doc; // the documentation description of this instruction - SmallVector Params; // the operands that this instruction takes + SmallVector Params; // the operands that this instruction takes StringRef OverloadTypes; // overload types if applicable StringRef FnAttr; // attribute shorthands: rn=does not access // memory,ro=only reads from memory - StringRef Intrinsic; // The llvm intrinsic map to DXILOp. Default is "" which + StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which // means no map exist bool IsDeriv = false; // whether this is some kind of derivative bool IsGradient = false; // whether this requires a gradient calculation @@ -71,11 +69,10 @@ struct DXILOperationData { int OverloadParamIndex; // parameter index which control the overload. // When < 0, should be only 1 overload type. SmallVector counters; // counters for this inst. 
- DXILOperationData(const Record *R) { - Name = R->getValueAsString("Name"); - DXILOp = R->getValueAsString("OpName"); - DXILOpID = R->getValueAsInt("OpCode"); - DXILClass = R->getValueAsDef("OpClass")->getValueAsString("Name"); + DXILOperationDesc(const Record *R) { + OpName = R->getValueAsString("OpName"); + OpCode = R->getValueAsInt("OpCode"); + OpClass = R->getValueAsDef("OpClass")->getValueAsString("Name"); Category = R->getValueAsDef("OpCategory")->getValueAsString("Name"); if (R->getValue("llvm_intrinsic")) { @@ -92,7 +89,7 @@ struct DXILOperationData { OverloadParamIndex = -1; for (unsigned I = 0; I < ParamList->size(); ++I) { Record *Param = ParamList->getElementAsRecord(I); - Params.emplace_back(DXILParam(Param)); + Params.emplace_back(DXILParameter(Param)); auto &CurParam = Params.back(); if (CurParam.Kind >= ParameterKind::OVERLOAD) OverloadParamIndex = I; @@ -121,7 +118,7 @@ static ParameterKind parameterTypeNameToKind(StringRef Name) { .Default(ParameterKind::INVALID); } -DXILParam::DXILParam(const Record *R) { +DXILParameter::DXILParameter(const Record *R) { Name = R->getValueAsString("Name"); Pos = R->getValueAsInt("Pos"); Kind = parameterTypeNameToKind(R->getValueAsString("LLVMType")); @@ -166,10 +163,9 @@ static std::string parameterKindToString(ParameterKind Kind) { llvm_unreachable("Unknown llvm::dxil::ParameterKind enum"); } -static void emitDXILOpEnum(DXILOperationData &DXILOp, raw_ostream &OS) { +static void emitDXILOpEnum(DXILOperationDesc &Op, raw_ostream &OS) { // Name = ID, // Doc - OS << DXILOp.Name << " = " << DXILOp.DXILOpID << ", // " << DXILOp.Doc - << "\n"; + OS << Op.OpName << " = " << Op.OpCode << ", // " << Op.Doc << "\n"; } static std::string buildCategoryStr(StringSet<> &Cetegorys) { @@ -182,14 +178,14 @@ static std::string buildCategoryStr(StringSet<> &Cetegorys) { } // Emit enum declaration for DXIL. -static void emitDXILEnums(std::vector &DXILOps, +static void emitDXILEnums(std::vector &Ops, raw_ostream &OS) { // Sort by Category + OpName. - llvm::sort(DXILOps, [](DXILOperationData &A, DXILOperationData &B) { + llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) { // Group by Category first. if (A.Category == B.Category) // Inside same Category, order by OpName. - return A.DXILOp < B.DXILOp; + return A.OpName < B.OpName; else return A.Category < B.Category; }); @@ -199,18 +195,18 @@ static void emitDXILEnums(std::vector &DXILOps, StringMap> ClassMap; StringRef PrevCategory = ""; - for (auto &DXILOp : DXILOps) { - StringRef Category = DXILOp.Category; + for (auto &Op : Ops) { + StringRef Category = Op.Category; if (Category != PrevCategory) { OS << "\n// " << Category << "\n"; PrevCategory = Category; } - emitDXILOpEnum(DXILOp, OS); - auto It = ClassMap.find(DXILOp.DXILClass); + emitDXILOpEnum(Op, OS); + auto It = ClassMap.find(Op.OpClass); if (It != ClassMap.end()) { - It->second.insert(DXILOp.Category); + It->second.insert(Op.Category); } else { - ClassMap[DXILOp.DXILClass].insert(DXILOp.Category); + ClassMap[Op.OpClass].insert(Op.Category); } } @@ -253,18 +249,18 @@ static void emitDXILEnums(std::vector &DXILOps, } // Emit map from llvm intrinsic to DXIL operation. -static void emitDXILIntrinsicMap(std::vector &DXILOps, +static void emitDXILIntrinsicMap(std::vector &Ops, raw_ostream &OS) { OS << "\n"; // FIXME: use array instead of SmallDenseMap. 
OS << "static const SmallDenseMap LowerMap = " "{\n"; - for (auto &DXILOp : DXILOps) { - if (DXILOp.Intrinsic.empty()) + for (auto &Op : Ops) { + if (Op.Intrinsic.empty()) continue; // {Intrinsic::sin, dxil::OpCode::Sin}, - OS << " { Intrinsic::" << DXILOp.Intrinsic - << ", dxil::OpCode::" << DXILOp.DXILOp << "},\n"; + OS << " { Intrinsic::" << Op.Intrinsic << ", dxil::OpCode::" << Op.OpName + << "},\n"; } OS << "};\n"; OS << "\n"; @@ -315,20 +311,20 @@ static std::string lowerFirstLetter(StringRef Name) { return LowerName; } -static std::string getDXILOpClassName(StringRef DXILOpClass) { +static std::string getDXILOpClassName(StringRef OpClass) { // Lower first letter expect for special case. - return StringSwitch(DXILOpClass) + return StringSwitch(OpClass) .Case("CBufferLoad", "cbufferLoad") .Case("CBufferLoadLegacy", "cbufferLoadLegacy") .Case("GSInstanceID", "gsInstanceID") - .Default(lowerFirstLetter(DXILOpClass)); + .Default(lowerFirstLetter(OpClass)); } -static void emitDXILOperationTable(std::vector &DXILOps, +static void emitDXILOperationTable(std::vector &Ops, raw_ostream &OS) { - // Sort by DXILOpID. - llvm::sort(DXILOps, [](DXILOperationData &A, DXILOperationData &B) { - return A.DXILOpID < B.DXILOpID; + // Sort by OpCode. + llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) { + return A.OpCode < B.OpCode; }); // Collect Names. @@ -338,18 +334,18 @@ static void emitDXILOperationTable(std::vector &DXILOps, StringMap> ParameterMap; StringSet<> ClassSet; - for (auto &DXILOp : DXILOps) { - OpStrings.add(DXILOp.DXILOp.str()); + for (auto &Op : Ops) { + OpStrings.add(Op.OpName.str()); - if (ClassSet.contains(DXILOp.DXILClass)) + if (ClassSet.contains(Op.OpClass)) continue; - ClassSet.insert(DXILOp.DXILClass); - OpClassStrings.add(getDXILOpClassName(DXILOp.DXILClass)); + ClassSet.insert(Op.OpClass); + OpClassStrings.add(getDXILOpClassName(Op.OpClass)); SmallVector ParamKindVec; - for (auto &Param : DXILOp.Params) { + for (auto &Param : Op.Params) { ParamKindVec.emplace_back(Param.Kind); } - ParameterMap[DXILOp.DXILClass] = ParamKindVec; + ParameterMap[Op.OpClass] = ParamKindVec; Parameters.add(ParamKindVec); } @@ -363,26 +359,25 @@ static void emitDXILOperationTable(std::vector &DXILOps, // OpCodeClassNameIndex, // OverloadKind::FLOAT | OverloadKind::HALF, Attribute::AttrKind::ReadNone, 0, // 3, ParameterTableOffset}, - OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode DXILOp) " + OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode Op) " "{\n"; OS << " static const OpCodeProperty OpCodeProps[] = {\n"; - for (auto &DXILOp : DXILOps) { - OS << " { dxil::OpCode::" << DXILOp.DXILOp << ", " - << OpStrings.get(DXILOp.DXILOp.str()) - << ", OpCodeClass::" << DXILOp.DXILClass << ", " - << OpClassStrings.get(getDXILOpClassName(DXILOp.DXILClass)) << ", " - << getDXILOperationOverload(DXILOp.OverloadTypes) << ", " - << emitDXILOperationFnAttr(DXILOp.FnAttr) << ", " - << DXILOp.OverloadParamIndex << ", " << DXILOp.Params.size() << ", " - << Parameters.get(ParameterMap[DXILOp.DXILClass]) << " },\n"; + for (auto &Op : Ops) { + OS << " { dxil::OpCode::" << Op.OpName << ", " + << OpStrings.get(Op.OpName.str()) << ", OpCodeClass::" << Op.OpClass + << ", " << OpClassStrings.get(getDXILOpClassName(Op.OpClass)) << ", " + << getDXILOperationOverload(Op.OverloadTypes) << ", " + << emitDXILOperationFnAttr(Op.FnAttr) << ", " << Op.OverloadParamIndex + << ", " << Op.Params.size() << ", " + << Parameters.get(ParameterMap[Op.OpClass]) << " },\n"; } OS << " };\n"; OS << 
" // FIXME: change search to indexing with\n"; - OS << " // DXILOp once all DXIL op is added.\n"; + OS << " // Op once all DXIL operations are added.\n"; OS << " OpCodeProperty TmpProp;\n"; - OS << " TmpProp.OpCode = DXILOp;\n"; + OS << " TmpProp.OpCode = Op;\n"; OS << " const OpCodeProperty *Prop =\n"; OS << " llvm::lower_bound(OpCodeProps, TmpProp,\n"; OS << " [](const OpCodeProperty &A, const " @@ -394,12 +389,12 @@ static void emitDXILOperationTable(std::vector &DXILOps, OS << "}\n\n"; // Emit the string tables. - OS << "static const char *getOpCodeName(dxil::OpCode DXILOp) {\n\n"; + OS << "static const char *getOpCodeName(dxil::OpCode Op) {\n\n"; OpStrings.emitStringLiteralDef(OS, " static const char DXILOpCodeNameTable[]"); - OS << " auto *Prop = getOpCodeProperty(DXILOp);\n"; + OS << " auto *Prop = getOpCodeProperty(Op);\n"; OS << " unsigned Index = Prop->OpCodeNameOffset;\n"; OS << " return DXILOpCodeNameTable + Index;\n"; OS << "}\n\n"; @@ -431,14 +426,14 @@ static void emitDXILOperationTable(std::vector &DXILOps, } static void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) { - std::vector Ops = Records.getAllDerivedDefinitions("DxilOperation"); + std::vector Ops = Records.getAllDerivedDefinitions("DXILOperation"); OS << "// Generated code, do not edit.\n"; OS << "\n"; - std::vector DXILOps; + std::vector DXILOps; DXILOps.reserve(Ops.size()); for (auto *Record : Ops) { - DXILOps.emplace_back(DXILOperationData(Record)); + DXILOps.emplace_back(DXILOperationDesc(Record)); } OS << "#ifdef DXIL_OP_ENUM\n"; -- cgit v1.1 From abc4f74df7ab3b324b7bf9d171e8a22a92d7dda5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Feb 2024 10:03:08 -0800 Subject: [flang][cuda] Lower attribute for local variable (#81076) This is a first simple patch to introduce a new FIR attribute to carry the CUDA variable attribute information to hlfir.declare and fir.declare operations. It currently lowers this information for local variables. The texture attribute is omitted since it is rejected by semantic and will not make its way to MLIR. This new attribute is added as optional attribute to the hlfir.declare and fir.declare operations. 
--- flang/include/flang/Lower/ConvertVariable.h | 6 ++++ flang/include/flang/Optimizer/Builder/HLFIRTools.h | 10 +++--- flang/include/flang/Optimizer/Dialect/FIRAttr.td | 23 +++++++++++- flang/include/flang/Optimizer/Dialect/FIROps.td | 3 +- flang/include/flang/Optimizer/HLFIR/HLFIROps.td | 6 ++-- flang/lib/Lower/ConvertVariable.cpp | 41 ++++++++++++++++++++-- flang/lib/Optimizer/Builder/HLFIRTools.cpp | 5 +-- flang/lib/Optimizer/Dialect/FIRAttr.cpp | 3 +- flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp | 5 +-- .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 6 +++- flang/test/Lower/CUDA/cuda-data-attribute.cuf | 22 ++++++++++++ flang/unittests/Optimizer/FortranVariableTest.cpp | 12 ++++--- 12 files changed, 121 insertions(+), 21 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-data-attribute.cuf diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h index 0ff3ca9..cdbf050 100644 --- a/flang/include/flang/Lower/ConvertVariable.h +++ b/flang/include/flang/Lower/ConvertVariable.h @@ -137,6 +137,12 @@ translateSymbolAttributes(mlir::MLIRContext *mlirContext, fir::FortranVariableFlagsEnum extraFlags = fir::FortranVariableFlagsEnum::None); +/// Translate the CUDA Fortran attributes of \p sym into the FIR CUDA attribute +/// representation. +fir::CUDAAttributeAttr +translateSymbolCUDAAttribute(mlir::MLIRContext *mlirContext, + const Fortran::semantics::Symbol &sym); + /// Map a symbol to a given fir::ExtendedValue. This will generate an /// hlfir.declare when lowering to HLFIR and map the hlfir.declare result to the /// symbol. diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index efbd57c..fe69ffa 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -233,11 +233,11 @@ translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder, fir::FortranVariableOpInterface fortranVariable); /// Generate declaration for a fir::ExtendedValue in memory. -fir::FortranVariableOpInterface genDeclare(mlir::Location loc, - fir::FirOpBuilder &builder, - const fir::ExtendedValue &exv, - llvm::StringRef name, - fir::FortranVariableFlagsAttr flags); +fir::FortranVariableOpInterface +genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, + const fir::ExtendedValue &exv, llvm::StringRef name, + fir::FortranVariableFlagsAttr flags, + fir::CUDAAttributeAttr cudaAttr = {}); /// Generate an hlfir.associate to build a variable from an expression value. 
/// The type of the variable must be provided so that scalar logicals are diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 114bf7d..bc73124 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -55,7 +55,28 @@ def fir_FortranVariableFlagsAttr : fir_Attr<"FortranVariableFlags"> { let returnType = "::fir::FortranVariableFlagsEnum"; let convertFromStorage = "$_self.getFlags()"; let constBuilderCall = - "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)"; + "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)"; +} + +def CUDAconstant : I32EnumAttrCase<"Constant", 0, "constant">; +def CUDAdevice : I32EnumAttrCase<"Device", 1, "device">; +def CUDAmanaged : I32EnumAttrCase<"Managed", 2, "managed">; +def CUDApinned : I32EnumAttrCase<"Pinned", 3, "pinned">; +def CUDAshared : I32EnumAttrCase<"Shared", 4, "shared">; +def CUDAunified : I32EnumAttrCase<"Unified", 5, "unified">; +// Texture is omitted since it is obsolete and rejected by semantic. + +def fir_CUDAAttribute : I32EnumAttr< + "CUDAAttribute", + "CUDA Fortran variable attributes", + [CUDAconstant, CUDAdevice, CUDAmanaged, CUDApinned, CUDAshared, + CUDAunified]> { + let genSpecializedAttr = 0; + let cppNamespace = "::fir"; +} + +def fir_CUDAAttributeAttr : EnumAttr { + let assemblyFormat = [{ ```<` $value `>` }]; } def fir_BoxFieldAttr : I32EnumAttr< diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index fcecc60..b954a0c 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3027,7 +3027,8 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments, Optional:$shape, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, - OptionalAttr:$fortran_attrs + OptionalAttr:$fortran_attrs, + OptionalAttr:$cuda_attr ); let results = (outs AnyRefOrBox); diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index 753ede2..f22e9a7 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -88,7 +88,8 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, Optional:$shape, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, - OptionalAttr:$fortran_attrs + OptionalAttr:$fortran_attrs, + OptionalAttr:$cuda_attr ); let results = (outs AnyFortranVariable, AnyRefOrBoxLike); @@ -101,7 +102,8 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, let builders = [ OpBuilder<(ins "mlir::Value":$memref, "llvm::StringRef":$uniq_name, CArg<"mlir::Value", "{}">:$shape, CArg<"mlir::ValueRange", "{}">:$typeparams, - CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs)>]; + CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs, + CArg<"fir::CUDAAttributeAttr", "{}">:$cuda_attr)>]; let extraClassDeclaration = [{ /// Get the variable original base (same as input). 
It lacks diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 8ea2557..f761e14 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1579,6 +1579,38 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes( return fir::FortranVariableFlagsAttr::get(mlirContext, flags); } +fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute( + mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) { + std::optional cudaAttr = + Fortran::semantics::GetCUDADataAttr(&sym); + if (cudaAttr) { + fir::CUDAAttribute attr; + switch (*cudaAttr) { + case Fortran::common::CUDADataAttr::Constant: + attr = fir::CUDAAttribute::Constant; + break; + case Fortran::common::CUDADataAttr::Device: + attr = fir::CUDAAttribute::Device; + break; + case Fortran::common::CUDADataAttr::Managed: + attr = fir::CUDAAttribute::Managed; + break; + case Fortran::common::CUDADataAttr::Pinned: + attr = fir::CUDAAttribute::Pinned; + break; + case Fortran::common::CUDADataAttr::Shared: + attr = fir::CUDAAttribute::Shared; + break; + case Fortran::common::CUDADataAttr::Texture: + // Obsolete attribute + break; + } + + return fir::CUDAAttributeAttr::get(mlirContext, attr); + } + return {}; +} + /// Map a symbol to its FIR address and evaluated specification expressions. /// Not for symbols lowered to fir.box. /// Will optionally create fir.declare. @@ -1618,6 +1650,8 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, auto name = converter.mangleName(sym); fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes(builder.getContext(), sym); + fir::CUDAAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), sym); if (isCrayPointee) { mlir::Type baseType = @@ -1664,7 +1698,7 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, return; } auto newBase = builder.create( - loc, base, name, shapeOrShift, lenParams, attributes); + loc, base, name, shapeOrShift, lenParams, attributes, cudaAttr); symMap.addVariableDefinition(sym, newBase, force); return; } @@ -1709,9 +1743,12 @@ void Fortran::lower::genDeclareSymbol( fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes( builder.getContext(), sym.GetUltimate(), extraFlags); + fir::CUDAAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), + sym.GetUltimate()); auto name = converter.mangleName(sym); hlfir::EntityWithAttributes declare = - hlfir::genDeclare(loc, builder, exv, name, attributes); + hlfir::genDeclare(loc, builder, exv, name, attributes, cudaAttr); symMap.addVariableDefinition(sym, declare.getIfVariableInterface(), force); return; } diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 94f723b..61e5311 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -198,7 +198,8 @@ mlir::Value hlfir::Entity::getFirBase() const { fir::FortranVariableOpInterface hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, - fir::FortranVariableFlagsAttr flags) { + fir::FortranVariableFlagsAttr flags, + fir::CUDAAttributeAttr cudaAttr) { mlir::Value base = fir::getBase(exv); assert(fir::conformsWithPassByRef(base.getType()) && @@ -228,7 +229,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, }, [](const auto &) {}); 
auto declareOp = builder.create( - loc, base, name, shapeOrShift, lenParams, flags); + loc, base, name, shapeOrShift, lenParams, flags, cudaAttr); return mlir::cast(declareOp.getOperation()); } diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 4871091..04431b6 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "mlir/IR/AttributeSupport.h" +#include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" #include "llvm/ADT/SmallString.h" @@ -297,5 +298,5 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, void FIROpsDialect::registerAttributes() { addAttributes(); + UpperBoundAttr, CUDAAttributeAttr>(); } diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index ce12e6f..85644c1 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -123,14 +123,15 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value memref, llvm::StringRef uniq_name, mlir::Value shape, mlir::ValueRange typeparams, - fir::FortranVariableFlagsAttr fortran_attrs) { + fir::FortranVariableFlagsAttr fortran_attrs, + fir::CUDAAttributeAttr cuda_attr) { auto nameAttr = builder.getStringAttr(uniq_name); mlir::Type inputType = memref.getType(); bool hasExplicitLbs = hasExplicitLowerBounds(shape); mlir::Type hlfirVariableType = getHLFIRVariableType(inputType, hasExplicitLbs); build(builder, result, {hlfirVariableType, inputType}, memref, shape, - typeparams, nameAttr, fortran_attrs); + typeparams, nameAttr, fortran_attrs, cuda_attr); } mlir::LogicalResult hlfir::DeclareOp::verify() { diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index b690185..b15fb59 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -320,12 +320,16 @@ public: mlir::Location loc = declareOp->getLoc(); mlir::Value memref = declareOp.getMemref(); fir::FortranVariableFlagsAttr fortranAttrs; + fir::CUDAAttributeAttr cudaAttr; if (auto attrs = declareOp.getFortranAttrs()) fortranAttrs = fir::FortranVariableFlagsAttr::get(rewriter.getContext(), *attrs); + if (auto attr = declareOp.getCudaAttr()) + cudaAttr = fir::CUDAAttributeAttr::get(rewriter.getContext(), *attr); auto firDeclareOp = rewriter.create( loc, memref.getType(), memref, declareOp.getShape(), - declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs); + declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs, + cudaAttr); // Propagate other attributes from hlfir.declare to fir.declare. // OpenACC's acc.declare is one example. Right now, the propagation diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf new file mode 100644 index 0000000..caa8ac7 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf @@ -0,0 +1,22 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s --check-prefix=FIR + +! Test lowering of CUDA attribute on local variables. 
+
+subroutine local_var_attrs
+  real, constant :: rc
+  real, device :: rd
+  real, allocatable, managed :: rm
+  real, allocatable, pinned :: rp
+end subroutine
+
+! CHECK-LABEL: func.func @_QPlocal_var_attrs()
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<constant>, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<constant>, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp
index 42ed225..4b101ce 100644
--- a/flang/unittests/Optimizer/FortranVariableTest.cpp
+++ b/flang/unittests/Optimizer/FortranVariableTest.cpp
@@ -49,7 +49,8 @@ TEST_F(FortranVariableTest, SimpleScalar) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_FALSE(fortranVariable.isArray());
@@ -74,7 +75,8 @@ TEST_F(FortranVariableTest, CharacterScalar) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       /*shape=*/mlir::Value{}, typeParams, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_FALSE(fortranVariable.isArray());
@@ -104,7 +106,8 @@ TEST_F(FortranVariableTest, SimpleArray) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       shape, /*typeParams*/ std::nullopt, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_TRUE(fortranVariable.isArray());
@@ -134,7 +137,8 @@ TEST_F(FortranVariableTest, CharacterArray) {
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(
       loc, addr.getType(), addr, shape, typeParams, name,
-      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+      /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+      /*cuda_attr=*/fir::CUDAAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_TRUE(fortranVariable.isArray());
-- cgit v1.1
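Taken together, this commit threads one optional attribute from Fortran semantics down to FIR: semantics yields the CUDA data attribute, lowering attaches it to hlfir.declare, and the HLFIR-to-FIR conversion copies it onto fir.declare only when present. A stand-alone model of that last propagation step, which the FIR check lines above verify (plain C++ stand-ins, not the real MLIR classes):

```cpp
// Illustrative model of DeclareOpConversion's attribute propagation:
// copy the optional CUDA attribute only when the source op carries one,
// mirroring `if (auto attr = declareOp.getCudaAttr())` above.
#include <iostream>
#include <optional>
#include <string>

enum class CUDAAttribute { Constant, Device, Managed, Pinned, Shared, Unified };

struct HLFIRDeclare { std::string uniqName; std::optional<CUDAAttribute> cudaAttr; };
struct FIRDeclare   { std::string uniqName; std::optional<CUDAAttribute> cudaAttr; };

FIRDeclare convertToFIR(const HLFIRDeclare &op) {
  FIRDeclare fir{op.uniqName, {}};
  if (op.cudaAttr) // absent attribute stays absent on the converted op
    fir.cudaAttr = op.cudaAttr;
  return fir;
}

int main() {
  HLFIRDeclare rd{"_QFlocal_var_attrsErd", CUDAAttribute::Device};
  FIRDeclare fir = convertToFIR(rd);
  std::cout << fir.uniqName << (fir.cudaAttr ? " has cuda_attr\n" : "\n");
}
```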
From 17f0680f69f44d340fd0205f7763b2830357c0d5 Mon Sep 17 00:00:00 2001
From: Krystian Stasiowski
Date: Thu, 8 Feb 2024 13:04:10 -0500
Subject: [Clang][Sema] Abbreviated function templates do not append invented
 parameters to empty template parameter lists (#80864)

According to [dcl.fct] p23:
> An abbreviated function template can have a _template-head_. The invented
> _template-parameters_ are appended to the _template-parameter-list_ after
> the explicitly declared _template-parameters_.

`template<>` is not a _template-head_ -- a _template-head_ must have at least
one _template-parameter_. This patch corrects our current behavior of
appending the invented template parameters to the innermost template
parameter list, regardless of whether it is empty. Example:
```
template<typename T>
struct A {
  void f(auto);
};

template<>
void A<int>::f(auto); // ok

template<>
template<> // warning: extraneous template parameter list in template specialization
void A<int>::f(auto);
```
---
 clang/docs/ReleaseNotes.rst                        |  2 ++
 clang/include/clang/AST/DeclTemplate.h             |  1 +
 clang/lib/AST/DeclPrinter.cpp                      |  4 ++++
 clang/lib/Sema/SemaDecl.cpp                        |  2 +-
 clang/lib/Sema/SemaDeclCXX.cpp                     | 11 +++++++++-
 clang/test/AST/ast-print-method-decl.cpp           |  3 +--
 .../test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp  | 24 ++++++++++++++++++++++
 clang/test/OpenMP/for_loop_auto.cpp                |  2 +-
 8 files changed, 44 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e158284..32440ee 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -215,6 +215,8 @@ Bug Fixes to C++ Support
   Fixes (`#68490 `_)
 - Fix a crash when trying to call a varargs function that also has an explicit object parameter.
   Fixes (`#80971 ICE when explicit object parameter be a function parameter pack`)
+- Fixed a bug where abbreviated function templates would append their invented template parameters to
+  an empty template parameter list.
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index baf7114..e3b6a7e 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -134,6 +134,7 @@ public:
   const_iterator end() const { return begin() + NumParams; }
 
   unsigned size() const { return NumParams; }
+  bool empty() const { return NumParams == 0; }
 
   ArrayRef<NamedDecl *> asArray() { return llvm::ArrayRef(begin(), end()); }
   ArrayRef<const NamedDecl *> asArray() const {
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 822ac12..43d2219 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1215,6 +1215,10 @@ void DeclPrinter::printTemplateParameters(const TemplateParameterList *Params,
                                           bool OmitTemplateKW) {
   assert(Params);
 
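The fix itself is the extra `empty()` guard in the SemaDecl.cpp hunk just below: invented parameters may only replace or extend the innermost explicit list when that list is a genuine template-head, i.e. non-empty. A small self-contained model of that decision (hypothetical types, mirroring the guard's logic rather than clang's real data structures):

```cpp
// Sketch of the corrected merge decision: an empty list (`template<>`)
// is not a template-head, so the invented list is kept separate.
#include <vector>

struct ParamList {
  std::vector<int> Params;
  unsigned Depth = 0;
  bool empty() const { return Params.empty(); }
};

void mergeInvented(std::vector<ParamList *> &Lists, ParamList *Invented) {
  // Merge only into a non-empty innermost list at the same depth.
  if (!Lists.empty() && !Lists.back()->empty() &&
      Invented->Depth == Lists.back()->Depth)
    Lists.back() = Invented;   // replace the innermost explicit list
  else
    Lists.push_back(Invented); // otherwise the invented list stands alone
}

int main() {
  ParamList empty;              // models `template<>`
  ParamList invented{{1}, 0};   // models the invented parameter list
  std::vector<ParamList *> lists{&empty};
  mergeInvented(lists, &invented); // appended, not merged: lists.size() == 2
}
```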
+  // Don't print invented template parameter lists.
+  if (!Params->empty() && Params->getParam(0)->isImplicit())
+    return;
+
   if (!OmitTemplateKW)
     Out << "template ";
   Out << '<';
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 18a5d93..2c526cd 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -9759,7 +9759,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   SmallVector<TemplateParameterList *, 4> TemplateParamLists;
   llvm::append_range(TemplateParamLists, TemplateParamListsRef);
   if (TemplateParameterList *Invented = D.getInventedTemplateParameterList()) {
-    if (!TemplateParamLists.empty() &&
+    if (!TemplateParamLists.empty() && !TemplateParamLists.back()->empty() &&
         Invented->getDepth() == TemplateParamLists.back()->getDepth())
       TemplateParamLists.back() = Invented;
     else
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index ab8a967..fea8c50 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -19294,7 +19294,16 @@ void Sema::ActOnStartFunctionDeclarationDeclarator(
             ExplicitLists, /*IsFriend=*/false, IsMemberSpecialization,
             IsInvalid, /*SuppressDiagnostic=*/true);
   }
-  if (ExplicitParams) {
+  // C++23 [dcl.fct]p23:
+  //   An abbreviated function template can have a template-head. The invented
+  //   template-parameters are appended to the template-parameter-list after
+  //   the explicitly declared template-parameters.
+  //
+  // A template-head must have one or more template-parameters (read:
+  // 'template<>' is *not* a template-head). Only append the invented
+  // template parameters if we matched the nested-name-specifier to a non-empty
+  // TemplateParameterList.
+  if (ExplicitParams && !ExplicitParams->empty()) {
     Info.AutoTemplateParameterDepth = ExplicitParams->getDepth();
     llvm::append_range(Info.TemplateParams, *ExplicitParams);
     Info.NumExplicitTemplateParams = ExplicitParams->size();
diff --git a/clang/test/AST/ast-print-method-decl.cpp b/clang/test/AST/ast-print-method-decl.cpp
index 9f5d112..75dea0c 100644
--- a/clang/test/AST/ast-print-method-decl.cpp
+++ b/clang/test/AST/ast-print-method-decl.cpp
@@ -32,8 +32,7 @@ struct DelegatingCtor2 {
 
 // CHECK: struct DelegatingCtor3 {
 struct DelegatingCtor3 {
-  // FIXME: template <> should not be output
-  // CHECK: template <> DelegatingCtor3(auto);
+  // CHECK: DelegatingCtor3(auto);
   DelegatingCtor3(auto);
 
   // FIXME: Implicitly specialized method should not be output
diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp
new file mode 100644
index 0000000..469c4e0
--- /dev/null
+++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify %s
+
+// FIXME: This should be an error with -pedantic-errors.
+template<> // expected-warning {{extraneous template parameter list in template specialization}}
+void f(auto);
+
+template<typename T>
+void f(auto);
+
+template<typename T>
+struct A {
+  void g(auto);
+};
+
+template<typename T>
+void A<T>::g(auto) { }
+
+template<>
+void A<int>::g(auto) { }
+
+// FIXME: This should be an error with -pedantic-errors.
+template<> +template<> // expected-warning {{extraneous template parameter list in template specialization}} +void A::g(auto) { } diff --git a/clang/test/OpenMP/for_loop_auto.cpp b/clang/test/OpenMP/for_loop_auto.cpp index b2c5540..4467de6 100644 --- a/clang/test/OpenMP/for_loop_auto.cpp +++ b/clang/test/OpenMP/for_loop_auto.cpp @@ -10,7 +10,7 @@ #ifndef HEADER #define HEADER -// CHECK: template <> void do_loop(const auto &v) { +// CHECK: void do_loop(const auto &v) { // CHECK-NEXT: #pragma omp parallel for // CHECK-NEXT: for (const auto &i : v) // CHECK-NEXT: ; -- cgit v1.1 From 35fae044c5faf8ddb9be7b47bb7573e839f77472 Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:12:24 -0600 Subject: [mlir][sparse] using non-static field to avoid data races. (#81165) --- .../SparseTensor/Transforms/Utils/LoopEmitter.cpp | 15 +++---- .../SparseTensor/Transforms/Utils/LoopEmitter.h | 1 + .../Transforms/Utils/SparseTensorLevel.cpp | 48 ++++++++++++++-------- .../Transforms/Utils/SparseTensorLevel.h | 20 +++++---- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp index 1c2857d..0ead135 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp @@ -94,7 +94,7 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput, this->loopTag = loopTag; this->hasOutput = hasOutput; this->isSparseOut = isSparseOut; - SparseIterator::setSparseEmitStrategy(emitStrategy); + this->emitStrategy = emitStrategy; const unsigned numManifestTensors = ts.size(); const unsigned synTensorId = numManifestTensors; @@ -166,13 +166,13 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput, std::unique_ptr LoopEmitter::makeLevelIterator(OpBuilder &builder, Location loc, TensorId t, Level l) { - auto it = makeSimpleIterator(*lvls[t][l]); + auto it = makeSimpleIterator(*lvls[t][l], emitStrategy); auto stt = getSparseTensorType(tensors[t]); if (stt.hasEncoding() && stt.getEncoding().isSlice()) { Value offset = genSliceOffset(builder, loc, tensors[t], l); Value stride = genSliceStride(builder, loc, tensors[t], l); - auto slicedIt = makeSlicedLevelIterator(std::move(it), offset, stride, - lvls[t][l]->getSize()); + auto slicedIt = makeSlicedLevelIterator( + std::move(it), offset, stride, lvls[t][l]->getSize(), emitStrategy); return slicedIt; } return it; @@ -186,7 +186,7 @@ void LoopEmitter::initializeLoopEmit( TensorId synId = getSynTensorId(); for (unsigned i = 0, e = loopHighs.size(); i < e; i++) { Value sz = loopHighs[i] = synSetter(builder, loc, i); - auto [stl, it] = makeSynLevelAndIterator(sz, synId, i); + auto [stl, it] = makeSynLevelAndIterator(sz, synId, i, emitStrategy); lvls[synId][i] = std::move(stl); iters[synId][i].emplace_back(std::move(it)); } @@ -317,12 +317,13 @@ void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) { size = ADDI(size, ADDI(MULI(idxMax, C_IDX(stride)), C_IDX(1))); } it = makeNonEmptySubSectIterator(builder, loc, parent, loopHighs[loop], - std::move(lvlIt), size, curDep.second); + std::move(lvlIt), size, curDep.second, + emitStrategy); } else { const SparseIterator &subSectIter = *iters[t][lvl].back(); it = makeTraverseSubSectIterator(builder, loc, subSectIter, *parent, std::move(lvlIt), loopHighs[loop], - curDep.second); + curDep.second, 
+                                       emitStrategy);
     }
     lastIter[t] = it.get();
     iters[t][lvl].emplace_back(std::move(it));
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 5bab2c6..7bfe713 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -380,6 +380,7 @@ private:
   /// tensor.
   bool hasOutput;
   bool isSparseOut;
+  SparseEmitStrategy emitStrategy;
 
   //
   // Fields which have `numTensor` many entries.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
index 04b49c3..4ba9ecb 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
@@ -773,9 +773,6 @@ public:
 // SparseIterator derived classes implementation.
 //===----------------------------------------------------------------------===//
 
-SparseEmitStrategy SparseIterator::emitStrategy =
-    SparseEmitStrategy::kFunctional;
-
 void SparseIterator::genInit(OpBuilder &b, Location l,
                              const SparseIterator *p) {
   if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
@@ -1303,27 +1300,38 @@ sparse_tensor::makeSparseTensorLevel(OpBuilder &b, Location l, Value t,
 }
 
 std::pair<std::unique_ptr<SparseTensorLevel>, std::unique_ptr<SparseIterator>>
-sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl) {
+sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
+                                       SparseEmitStrategy strategy) {
   auto stl = std::make_unique<DenseLevel>(tid, lvl, sz, /*encoded=*/false);
   auto it = std::make_unique<TrivialIterator>(*stl);
+  it->setSparseEmitStrategy(strategy);
   return std::make_pair(std::move(stl), std::move(it));
 }
 
 std::unique_ptr<SparseIterator>
-sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl) {
+sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl,
+                                  SparseEmitStrategy strategy) {
+  std::unique_ptr<SparseIterator> ret;
   if (!isUniqueLT(stl.getLT())) {
    // We always deduplicate the non-unique level, but we should optimize it away
    // if possible.
-    return std::make_unique<DedupIterator>(stl);
+    ret = std::make_unique<DedupIterator>(stl);
+  } else {
+    ret = std::make_unique<TrivialIterator>(stl);
   }
-  return std::make_unique<TrivialIterator>(stl);
+  ret->setSparseEmitStrategy(strategy);
+  return ret;
 }
 
 std::unique_ptr<SparseIterator>
 sparse_tensor::makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit,
-                                       Value offset, Value stride, Value size) {
+                                       Value offset, Value stride, Value size,
+                                       SparseEmitStrategy strategy) {
 
-  return std::make_unique<FilterIterator>(std::move(sit), offset, stride, size);
+  auto ret =
+      std::make_unique<FilterIterator>(std::move(sit), offset, stride, size);
+  ret->setSparseEmitStrategy(strategy);
+  return ret;
 }
 
 static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
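Moving `emitStrategy` from a static member to an instance field, set through the factory functions, is what removes the data race: a static field is one process-wide knob that every compilation thread writes, while a per-instance field is owned by the iterator that was configured. A compilable miniature of the before/after (the names echo the patch; everything else is illustrative):

```cpp
// Before: one shared setting; threads configuring different strategies race.
// After: each iterator owns its strategy, fixed when it is created.
#include <thread>

enum class SparseEmitStrategy { kFunctional, kDebugInterface };

struct SharedConfigIterator {               // the old shape: racy
  static SparseEmitStrategy emitStrategy;
  static void setSparseEmitStrategy(SparseEmitStrategy s) { emitStrategy = s; }
};
SparseEmitStrategy SharedConfigIterator::emitStrategy =
    SparseEmitStrategy::kFunctional;

struct OwnedConfigIterator {                // the new shape: thread-safe
  SparseEmitStrategy emitStrategy = SparseEmitStrategy::kFunctional;
  void setSparseEmitStrategy(SparseEmitStrategy s) { emitStrategy = s; }
};

int main() {
  // Two concurrent compilations can now use different strategies safely.
  std::thread a([] {
    OwnedConfigIterator it;
    it.setSparseEmitStrategy(SparseEmitStrategy::kFunctional);
  });
  std::thread b([] {
    OwnedConfigIterator it;
    it.setSparseEmitStrategy(SparseEmitStrategy::kDebugInterface);
  });
  a.join();
  b.join();
}
```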
@@ -1335,38 +1343,42 @@ static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
 std::unique_ptr<SparseIterator> sparse_tensor::makeNonEmptySubSectIterator(
     OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound,
-    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride) {
+    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride,
+    SparseEmitStrategy strategy) {
   // Try unwrap the NonEmptySubSectIterator from a filter parent.
   parent = tryUnwrapFilter(parent);
-  auto it = std::make_unique<NonEmptySubSectIterator>(
-      b, l, parent, std::move(delegate), size);
+  std::unique_ptr<SparseIterator> it =
+      std::make_unique<NonEmptySubSectIterator>(b, l, parent,
+                                                std::move(delegate), size);
 
   if (stride != 1) {
     // TODO: We can safely skip bound checking on sparse levels, but for dense
     // iteration space, we need the bound to infer the dense loop range.
-    return std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
-                                            C_IDX(stride), /*size=*/loopBound);
+    it = std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
+                                          C_IDX(stride), /*size=*/loopBound);
   }
+  it->setSparseEmitStrategy(strategy);
   return it;
 }
 
 std::unique_ptr<SparseIterator> sparse_tensor::makeTraverseSubSectIterator(
     OpBuilder &b, Location l, const SparseIterator &subSectIter,
     const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
-    Value loopBound, unsigned stride) {
+    Value loopBound, unsigned stride, SparseEmitStrategy strategy) {
   // This must be a subsection iterator or a filtered subsection iterator.
   auto &subSect =
       llvm::cast<NonEmptySubSectIterator>(*tryUnwrapFilter(&subSectIter));
 
-  auto it = std::make_unique<SubSectIterator>(
+  std::unique_ptr<SparseIterator> it = std::make_unique<SubSectIterator>(
       subSect, *tryUnwrapFilter(&parent), std::move(wrap));
 
   if (stride != 1) {
-    return std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
-                                            C_IDX(stride), /*size=*/loopBound);
+    it = std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
+                                          C_IDX(stride), /*size=*/loopBound);
   }
+  it->setSparseEmitStrategy(strategy);
   return it;
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
index fc2d9de..d1e94b7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
@@ -111,8 +111,8 @@ protected:
 public:
   virtual ~SparseIterator() = default;
 
-  static void setSparseEmitStrategy(SparseEmitStrategy strategy) {
-    SparseIterator::emitStrategy = strategy;
+  void setSparseEmitStrategy(SparseEmitStrategy strategy) {
+    emitStrategy = strategy;
   }
 
   virtual std::string getDebugInterfacePrefix() const = 0;
@@ -248,7 +248,7 @@ protected:
     return ref.take_front(cursorValsCnt);
   }
 
-  static SparseEmitStrategy emitStrategy;
+  SparseEmitStrategy emitStrategy;
 
 public:
   const IterKind kind; // For LLVM-style RTTI.
@@ -277,32 +277,34 @@ std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &builder,
 
 /// Helper function to create a simple SparseIterator object that iterates over
 /// the SparseTensorLevel.
-std::unique_ptr<SparseIterator>
-makeSimpleIterator(const SparseTensorLevel &stl);
+std::unique_ptr<SparseIterator> makeSimpleIterator(const SparseTensorLevel &stl,
+                                                   SparseEmitStrategy strategy);
 
 /// Helper function to create a synthetic SparseIterator object that iterates
 /// over a dense space specified by [0,`sz`).
 std::pair<std::unique_ptr<SparseTensorLevel>, std::unique_ptr<SparseIterator>>
-makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl);
+makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
+                        SparseEmitStrategy strategy);
 
 /// Helper function to create a SparseIterator object that iterates over a
 /// sliced space; the original space (before slicing) is traversed by `sit`.
 std::unique_ptr<SparseIterator>
 makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit, Value offset,
-                        Value stride, Value size);
+                        Value stride, Value size, SparseEmitStrategy strategy);
 
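Note the shape of the fix in both factories above: the early `return std::make_unique<FilterIterator>(...)` became an assignment, so that `setSparseEmitStrategy` now runs on whichever iterator, wrapped or not, is finally returned. A reduced sketch of that wrap-then-configure order (hypothetical types, not the real iterator hierarchy):

```cpp
// The strided path wraps the base iterator in a filter (a decorator), and
// only after any wrapping is the per-instance strategy applied, so it
// always lands on the outermost object.
#include <memory>
#include <utility>

enum class Strategy { kFunctional, kDebugInterface };

struct Iterator {
  virtual ~Iterator() = default;
  void setStrategy(Strategy s) { strategy = s; }
  Strategy strategy = Strategy::kFunctional;
};
struct BaseIterator : Iterator {};
struct FilterIterator : Iterator { // decorator over another iterator
  explicit FilterIterator(std::unique_ptr<Iterator> inner)
      : inner(std::move(inner)) {}
  std::unique_ptr<Iterator> inner;
};

std::unique_ptr<Iterator> makeIterator(unsigned stride, Strategy s) {
  std::unique_ptr<Iterator> it = std::make_unique<BaseIterator>();
  if (stride != 1)
    it = std::make_unique<FilterIterator>(std::move(it)); // wrap, no early return
  it->setStrategy(s); // always reaches the outermost wrapper
  return it;
}

int main() {
  auto it = makeIterator(/*stride=*/2, Strategy::kFunctional);
  (void)it;
}
```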
/// Helper function to create a SparseIterator object that iterates over the
/// non-empty subsections set.
 std::unique_ptr<SparseIterator> makeNonEmptySubSectIterator(
     OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound,
-    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride);
+    std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride,
+    SparseEmitStrategy strategy);
 
 /// Helper function to create a SparseIterator object that iterates over a
 /// non-empty subsection created by NonEmptySubSectIterator.
 std::unique_ptr<SparseIterator> makeTraverseSubSectIterator(
     OpBuilder &b, Location l, const SparseIterator &subsectIter,
     const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
-    Value loopBound, unsigned stride);
+    Value loopBound, unsigned stride, SparseEmitStrategy strategy);
 
 } // namespace sparse_tensor
 } // namespace mlir
-- cgit v1.1

From da95d926f6fce4ed9707c77908ad96624268f134 Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Thu, 8 Feb 2024 19:19:18 +0100
Subject: [clang][lex] Always pass suggested module to `InclusionDirective()`
 callback (#81061)

This patch provides more information to the `PPCallbacks::InclusionDirective()`
hook. We now always pass the suggested module, regardless of whether it was
actually imported or not. The extra `bool ModuleImported` parameter then
denotes whether the header `#include` will be automatically translated into an
import of the module.

The main change is in `clang/lib/Lex/PPDirectives.cpp`, where we take care not
to modify `SuggestedModule` after it's been populated by
`LookupHeaderIncludeOrImport()`. We now exclusively use the `SM`
(`ModuleToImport`) variable instead, which has been equivalent to
`SuggestedModule` until now. This allows us to use the original, unmodified
`SuggestedModule` for the callback itself.

(This patch turns out to be necessary for
https://github.com/apple/llvm-project/pull/8011).
---
 clang-tools-extra/clang-move/Move.cpp              |  3 +-
 .../clang-tidy/ExpandModularHeadersPPCallbacks.cpp |  6 +-
 .../clang-tidy/ExpandModularHeadersPPCallbacks.h   |  2 +-
 .../altera/KernelNameRestrictionCheck.cpp          |  5 +-
 .../clang-tidy/bugprone/SuspiciousIncludeCheck.cpp |  7 ++-
 .../clang-tidy/llvm/IncludeOrderCheck.cpp          |  7 ++-
 .../llvmlibc/RestrictSystemLibcHeadersCheck.cpp    |  9 +--
 .../clang-tidy/misc/HeaderIncludeCycleCheck.cpp    |  2 +-
 .../modernize/DeprecatedHeadersCheck.cpp           |  7 ++-
 .../clang-tidy/modernize/MacroToEnumCheck.cpp      |  3 +-
 .../portability/RestrictSystemIncludesCheck.cpp    |  4 +-
 .../portability/RestrictSystemIncludesCheck.h      |  3 +-
 .../readability/DuplicateIncludeCheck.cpp          |  7 ++-
 .../clang-tidy/utils/IncludeInserter.cpp           |  3 +-
 clang-tools-extra/clangd/Headers.cpp               |  3 +-
 clang-tools-extra/clangd/ParsedAST.cpp             |  2 +-
 clang-tools-extra/clangd/index/IndexAction.cpp     |  3 +-
 .../clangd/unittests/ReplayPeambleTests.cpp        |  2 +-
 clang-tools-extra/include-cleaner/lib/Record.cpp   |  6 +-
 clang-tools-extra/modularize/CoverageChecker.cpp   |  3 +-
 .../modularize/PreprocessorTracker.cpp             | 20 +++----
 clang-tools-extra/pp-trace/PPCallbacksTracker.cpp  |  6 +-
 clang-tools-extra/pp-trace/PPCallbacksTracker.h    |  3 +-
 .../test/pp-trace/pp-trace-include.cpp             | 12 ++--
 clang/include/clang/Lex/PPCallbacks.h              | 16 +++--
 clang/include/clang/Lex/PreprocessingRecord.h      |  3 +-
 .../DependencyScanning/ModuleDepCollector.h        |  3 +-
 clang/lib/CodeGen/MacroPPCallbacks.cpp             |  4 +-
 clang/lib/CodeGen/MacroPPCallbacks.h               |  3 +-
 clang/lib/Frontend/DependencyFile.cpp              |  3 +-
 clang/lib/Frontend/DependencyGraph.cpp             |  7 ++-
 clang/lib/Frontend/ModuleDependencyCollector.cpp   |  3 +-
 clang/lib/Frontend/PrecompiledPreamble.cpp         |  3 +-
 clang/lib/Frontend/PrintPreprocessedOutput.cpp     | 11 ++--
clang/lib/Frontend/Rewrite/InclusionRewriter.cpp | 10 ++-- clang/lib/Lex/PPDirectives.cpp | 70 +++++++++++----------- clang/lib/Lex/PreprocessingRecord.cpp | 11 ++-- .../DependencyScanning/ModuleDepCollector.cpp | 8 +-- clang/tools/libclang/Indexing.cpp | 5 +- clang/unittests/Lex/PPCallbacksTest.cpp | 9 ++- 40 files changed, 168 insertions(+), 129 deletions(-) diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp index 1d10348..ac16803 100644 --- a/clang-tools-extra/clang-move/Move.cpp +++ b/clang-tools-extra/clang-move/Move.cpp @@ -133,7 +133,8 @@ public: CharSourceRange FilenameRange, OptionalFileEntryRef /*File*/, StringRef SearchPath, StringRef /*RelativePath*/, - const Module * /*Imported*/, + const Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind /*FileType*/) override { if (auto FileEntry = SM.getFileEntryRefForID(SM.getFileID(HashLoc))) MoveTool->addIncludes(FileName, IsAngled, SearchPath, diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp index 5ecd4fb..5e2cc20 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp @@ -166,12 +166,12 @@ void ExpandModularHeadersPPCallbacks::InclusionDirective( SourceLocation DirectiveLoc, const Token &IncludeToken, StringRef IncludedFilename, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef IncludedFile, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) { - if (Imported) { + if (ModuleImported) { serialization::ModuleFile *MF = Compiler.getASTReader()->getModuleManager().lookup( - *Imported->getASTFile()); + *SuggestedModule->getASTFile()); handleModuleFile(MF); } parseToLocation(DirectiveLoc); diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h index 3f6abc3..0742c21 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h @@ -69,7 +69,7 @@ private: bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef IncludedFile, StringRef SearchPath, StringRef RelativePath, - const Module *Imported, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp index 084e44a..fb1e0e8 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp @@ -29,7 +29,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FileNameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; @@ -61,7 +62,7 @@ void KernelNameRestrictionCheck::registerPPCallbacks(const SourceManager &SM, void KernelNameRestrictionPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &, StringRef FileName, bool, CharSourceRange, OptionalFileEntryRef, StringRef, StringRef, 
const Module *, - SrcMgr::CharacteristicKind) { + bool, SrcMgr::CharacteristicKind) { IncludeDirective ID = {HashLoc, FileName}; IncludeDirectives.push_back(std::move(ID)); } diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp index 61d89cf..09ba79f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp @@ -26,7 +26,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; private: @@ -51,8 +52,8 @@ void SuspiciousIncludeCheck::registerPPCallbacks( void SuspiciousIncludePPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if (IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp_import) return; diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp index bdd72f8..4246c8c5 100644 --- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp @@ -27,7 +27,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; @@ -81,8 +82,8 @@ static int getPriority(StringRef Filename, bool IsAngled, bool IsMainModule) { void IncludeOrderPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // We recognize the first include as a special main module header and want // to leave it in the top position. 
IncludeDirective ID = {HashLoc, FilenameRange, std::string(FileName), diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp index 3451d34..b656917 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp @@ -33,7 +33,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; private: @@ -45,14 +46,14 @@ private: void RestrictedIncludesPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // Compiler provided headers are allowed (e.g stddef.h). if (SrcMgr::isSystem(FileType) && SearchPath == CompilerIncudeDir) return; portability::RestrictedIncludesPPCallbacks::InclusionDirective( HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath, - RelativePath, Imported, FileType); + RelativePath, SuggestedModule, ModuleImported, FileType); } void RestrictSystemLibcHeadersCheck::registerPPCallbacks( diff --git a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp index bebd6e39..fadfdc8 100644 --- a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp @@ -83,7 +83,7 @@ public: void InclusionDirective(SourceLocation, const Token &, StringRef FilePath, bool, CharSourceRange Range, OptionalFileEntryRef File, StringRef, StringRef, - const Module *, + const Module *, bool, SrcMgr::CharacteristicKind FileType) override { if (FileType != clang::SrcMgr::C_User) return; diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp index 030a781..6d287eb 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp @@ -32,7 +32,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; private: @@ -178,8 +179,8 @@ IncludeModernizePPCallbacks::IncludeModernizePPCallbacks( void IncludeModernizePPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // If we don't want to warn for non-main file reports and this is one, skip // it. 
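All of the clang-tidy changes in this commit are the same mechanical signature update. For out-of-tree `PPCallbacks` subclasses, the port looks like the following sketch, written against the new declaration shown later in `clang/include/clang/Lex/PPCallbacks.h` (the logging itself is illustrative only):

```cpp
// Sketch of adapting a downstream PPCallbacks subclass: the old single
// `const Module *Imported` parameter becomes the (SuggestedModule,
// ModuleImported) pair, and code that used to test the pointer to mean
// "this #include became an import" should now test the bool.
#include "clang/Basic/Module.h"
#include "clang/Lex/PPCallbacks.h"
#include "llvm/Support/raw_ostream.h"

class MyIncludeLogger : public clang::PPCallbacks {
  void InclusionDirective(clang::SourceLocation HashLoc,
                          const clang::Token &IncludeTok,
                          llvm::StringRef FileName, bool IsAngled,
                          clang::CharSourceRange FilenameRange,
                          clang::OptionalFileEntryRef File,
                          llvm::StringRef SearchPath,
                          llvm::StringRef RelativePath,
                          const clang::Module *SuggestedModule,
                          bool ModuleImported,
                          clang::SrcMgr::CharacteristicKind FileType) override {
    if (ModuleImported) // previously: if (Imported)
      llvm::outs() << FileName << " -> import "
                   << SuggestedModule->getFullModuleName() << "\n";
    else if (SuggestedModule) // newly observable: suggested but not imported
      llvm::outs() << FileName << " (module suggested, not imported)\n";
  }
};
```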
diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp index b197c22..0b47ed3 100644 --- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp @@ -117,7 +117,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { clearCurrentEnum(HashLoc); } diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp index 9ee0b4e..db5693e 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp @@ -21,8 +21,8 @@ namespace clang::tidy::portability { void RestrictedIncludesPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if (!Check.contains(FileName) && SrcMgr::isSystem(FileType)) { SmallString<256> FullPath; llvm::sys::path::append(FullPath, SearchPath); diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h index ad18e6f..60fae5e 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h @@ -50,7 +50,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override; diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp index d1f41e0..6714716 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp @@ -47,7 +47,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void MacroDefined(const Token &MacroNameTok, @@ -76,8 +77,8 @@ void DuplicateIncludeCallbacks::FileChanged(SourceLocation Loc, void DuplicateIncludeCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if 
(llvm::is_contained(Files.back(), FileName)) { // We want to delete the entire line, so make sure that [Start,End] covers // everything. diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp index d0b7474..b53016f 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp @@ -25,7 +25,8 @@ public: bool IsAngled, CharSourceRange FileNameRange, OptionalFileEntryRef /*IncludedFile*/, StringRef /*SearchPath*/, StringRef /*RelativePath*/, - const Module * /*ImportedModule*/, + const Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind /*FileType*/) override { Inserter->addInclude(FileNameRef, IsAngled, HashLocation, IncludeToken.getEndLoc()); diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp index 076e636..75f8668 100644 --- a/clang-tools-extra/clangd/Headers.cpp +++ b/clang-tools-extra/clangd/Headers.cpp @@ -41,7 +41,8 @@ public: OptionalFileEntryRef File, llvm::StringRef /*SearchPath*/, llvm::StringRef /*RelativePath*/, - const clang::Module * /*Imported*/, + const clang::Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind FileKind) override { auto MainFID = SM.getMainFileID(); // If an include is part of the preamble patch, translate #line directives. diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 14a9179..bbb0e2c 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -244,7 +244,7 @@ private: SynthesizedFilenameTok.getEndLoc()) .toCharRange(SM), File, "SearchPath", "RelPath", - /*Imported=*/nullptr, Inc.FileKind); + /*SuggestedModule=*/nullptr, /*ModuleImported=*/false, Inc.FileKind); if (File) Delegate->FileSkipped(*File, SynthesizedFilenameTok, Inc.FileKind); } diff --git a/clang-tools-extra/clangd/index/IndexAction.cpp b/clang-tools-extra/clangd/index/IndexAction.cpp index 5d56285..ed56c2a 100644 --- a/clang-tools-extra/clangd/index/IndexAction.cpp +++ b/clang-tools-extra/clangd/index/IndexAction.cpp @@ -89,7 +89,8 @@ public: llvm::StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, llvm::StringRef SearchPath, - llvm::StringRef RelativePath, const Module *Imported, + llvm::StringRef RelativePath, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { auto IncludeURI = toURI(File); if (!IncludeURI) diff --git a/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp b/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp index 472fe30..147d9ab 100644 --- a/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp +++ b/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp @@ -72,7 +72,7 @@ struct ReplayPreamblePPCallback : public PPCallbacks { void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef, - StringRef, StringRef, const clang::Module *, + StringRef, StringRef, const clang::Module *, bool, SrcMgr::CharacteristicKind) override { Includes.emplace_back(SM, HashLoc, IncludeTok, FileName, IsAngled, FilenameRange); diff --git a/clang-tools-extra/include-cleaner/lib/Record.cpp b/clang-tools-extra/include-cleaner/lib/Record.cpp index c93c56a..78a4df6 100644 --- a/clang-tools-extra/include-cleaner/lib/Record.cpp +++ 
b/clang-tools-extra/include-cleaner/lib/Record.cpp @@ -65,7 +65,8 @@ public: StringRef SpelledFilename, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind) override { if (!Active) return; @@ -214,7 +215,8 @@ public: OptionalFileEntryRef File, llvm::StringRef /*SearchPath*/, llvm::StringRef /*RelativePath*/, - const clang::Module * /*Imported*/, + const clang::Module * /*SuggestedModule*/, + bool /*ModuleImported*/, SrcMgr::CharacteristicKind FileKind) override { FileID HashFID = SM.getFileID(HashLoc); int HashLine = SM.getLineNumber(HashFID, SM.getFileOffset(HashLoc)); diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index 1e8b0aa..0e76c53 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -90,7 +90,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { Checker.collectUmbrellaHeaderHeader(File->getName()); } diff --git a/clang-tools-extra/modularize/PreprocessorTracker.cpp b/clang-tools-extra/modularize/PreprocessorTracker.cpp index 7557fb1..85e3aab 100644 --- a/clang-tools-extra/modularize/PreprocessorTracker.cpp +++ b/clang-tools-extra/modularize/PreprocessorTracker.cpp @@ -730,15 +730,14 @@ public: ~PreprocessorCallbacks() override {} // Overridden handlers. - void InclusionDirective(clang::SourceLocation HashLoc, - const clang::Token &IncludeTok, - llvm::StringRef FileName, bool IsAngled, - clang::CharSourceRange FilenameRange, - clang::OptionalFileEntryRef File, - llvm::StringRef SearchPath, - llvm::StringRef RelativePath, - const clang::Module *Imported, - clang::SrcMgr::CharacteristicKind FileType) override; + void + InclusionDirective(clang::SourceLocation HashLoc, + const clang::Token &IncludeTok, llvm::StringRef FileName, + bool IsAngled, clang::CharSourceRange FilenameRange, + clang::OptionalFileEntryRef File, + llvm::StringRef SearchPath, llvm::StringRef RelativePath, + const clang::Module *SuggestedModule, bool ModuleImported, + clang::SrcMgr::CharacteristicKind FileType) override; void FileChanged(clang::SourceLocation Loc, clang::PPCallbacks::FileChangeReason Reason, clang::SrcMgr::CharacteristicKind FileType, @@ -1275,7 +1274,8 @@ void PreprocessorCallbacks::InclusionDirective( llvm::StringRef FileName, bool IsAngled, clang::CharSourceRange FilenameRange, clang::OptionalFileEntryRef File, llvm::StringRef SearchPath, llvm::StringRef RelativePath, - const clang::Module *Imported, clang::SrcMgr::CharacteristicKind FileType) { + const clang::Module *SuggestedModule, bool ModuleImported, + clang::SrcMgr::CharacteristicKind FileType) { int DirectiveLine, DirectiveColumn; std::string HeaderPath = getSourceLocationFile(PP, HashLoc); getSourceLocationLineAndColumn(PP, HashLoc, DirectiveLine, DirectiveColumn); diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp index a59a827..3bb30fd 100644 --- a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp +++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp @@ -135,7 +135,8 @@ void 
PPCallbacksTracker::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, llvm::StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, llvm::StringRef SearchPath, llvm::StringRef RelativePath, - const Module *Imported, SrcMgr::CharacteristicKind FileType) { + const Module *SuggestedModule, bool ModuleImported, + SrcMgr::CharacteristicKind FileType) { beginCallback("InclusionDirective"); appendArgument("HashLoc", HashLoc); appendArgument("IncludeTok", IncludeTok); @@ -145,7 +146,8 @@ void PPCallbacksTracker::InclusionDirective( appendArgument("File", File); appendFilePathArgument("SearchPath", SearchPath); appendFilePathArgument("RelativePath", RelativePath); - appendArgument("Imported", Imported); + appendArgument("SuggestedModule", SuggestedModule); + appendArgument("ModuleImported", ModuleImported); } // Callback invoked whenever there was an explicit module-import diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.h b/clang-tools-extra/pp-trace/PPCallbacksTracker.h index c195a72..04590a9 100644 --- a/clang-tools-extra/pp-trace/PPCallbacksTracker.h +++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.h @@ -95,7 +95,8 @@ public: llvm::StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, llvm::StringRef SearchPath, - llvm::StringRef RelativePath, const Module *Imported, + llvm::StringRef RelativePath, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path, const Module *Imported) override; diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index db0b2c8..ea9896e 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -59,7 +59,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level1A.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}pp-trace" // CHECK-NEXT: RelativePath: "Inputs/Level1A.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level1A.h:1:1" // CHECK-NEXT: Reason: EnterFile @@ -74,7 +75,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level2A.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}Inputs" // CHECK-NEXT: RelativePath: "Level2A.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level2A.h:1:1" // CHECK-NEXT: Reason: EnterFile @@ -105,7 +107,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level1B.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}pp-trace" // CHECK-NEXT: RelativePath: "Inputs/Level1B.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level1B.h:1:1" // CHECK-NEXT: Reason: EnterFile @@ -120,7 +123,8 @@ // CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level2B.h" // CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}Inputs" // CHECK-NEXT: RelativePath: "Level2B.h" -// CHECK-NEXT: Imported: (null) +// CHECK-NEXT: SuggestedModule: (null) +// CHECK-NEXT: ModuleImported: false // CHECK-NEXT: - Callback: FileChanged // CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level2B.h:1:1" // CHECK-NEXT: Reason: EnterFile diff --git 
a/clang/include/clang/Lex/PPCallbacks.h b/clang/include/clang/Lex/PPCallbacks.h index e3942af..dfc74b5 100644 --- a/clang/include/clang/Lex/PPCallbacks.h +++ b/clang/include/clang/Lex/PPCallbacks.h @@ -127,8 +127,10 @@ public: /// \param RelativePath The path relative to SearchPath, at which the include /// file was found. This is equal to FileName except for framework includes. /// - /// \param Imported The module, whenever an inclusion directive was - /// automatically turned into a module import or null otherwise. + /// \param SuggestedModule The module suggested for this header, if any. + /// + /// \param ModuleImported Whether this include was translated into import of + /// \p SuggestedModule. /// /// \param FileType The characteristic kind, indicates whether a file or /// directory holds normal user code, system code, or system code which is @@ -139,7 +141,8 @@ public: bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, StringRef RelativePath, - const Module *Imported, + const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) {} /// Callback invoked whenever a submodule was entered. @@ -473,14 +476,15 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { First->InclusionDirective(HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath, RelativePath, - Imported, FileType); + SuggestedModule, ModuleImported, FileType); Second->InclusionDirective(HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath, RelativePath, - Imported, FileType); + SuggestedModule, ModuleImported, FileType); } void EnteredSubmodule(Module *M, SourceLocation ImportLoc, diff --git a/clang/include/clang/Lex/PreprocessingRecord.h b/clang/include/clang/Lex/PreprocessingRecord.h index 5ddf024..437d8e4c 100644 --- a/clang/include/clang/Lex/PreprocessingRecord.h +++ b/clang/include/clang/Lex/PreprocessingRecord.h @@ -532,7 +532,8 @@ class Token; StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, + const Module *SuggestedModule, bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void Ifdef(SourceLocation Loc, const Token &MacroNameTok, const MacroDefinition &MD) override; diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index 051363b..13ad253 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -166,7 +166,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path, const Module *Imported) override; diff --git a/clang/lib/CodeGen/MacroPPCallbacks.cpp b/clang/lib/CodeGen/MacroPPCallbacks.cpp index 8589869..c5d1e3a 100644 --- a/clang/lib/CodeGen/MacroPPCallbacks.cpp +++ 
b/clang/lib/CodeGen/MacroPPCallbacks.cpp @@ -168,8 +168,8 @@ void MacroPPCallbacks::FileChanged(SourceLocation Loc, FileChangeReason Reason, void MacroPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // Record the line location of the current included file. LastHashLoc = HashLoc; diff --git a/clang/lib/CodeGen/MacroPPCallbacks.h b/clang/lib/CodeGen/MacroPPCallbacks.h index 5af177d..5f46864 100644 --- a/clang/lib/CodeGen/MacroPPCallbacks.h +++ b/clang/lib/CodeGen/MacroPPCallbacks.h @@ -102,7 +102,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; /// Hook called whenever a macro definition is seen. diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 19abcac..369816e 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -66,7 +66,8 @@ struct DepCollectorPPCallbacks : public PPCallbacks { StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { if (!File) DepCollector.maybeAddDependency(FileName, /*FromModule*/ false, diff --git a/clang/lib/Frontend/DependencyGraph.cpp b/clang/lib/Frontend/DependencyGraph.cpp index b471471..20e5f23 100644 --- a/clang/lib/Frontend/DependencyGraph.cpp +++ b/clang/lib/Frontend/DependencyGraph.cpp @@ -49,7 +49,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void EndOfMainFile() override { @@ -68,8 +69,8 @@ void clang::AttachDependencyGraphGen(Preprocessor &PP, StringRef OutputFile, void DependencyGraphCallback::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { if (!File) return; diff --git a/clang/lib/Frontend/ModuleDependencyCollector.cpp b/clang/lib/Frontend/ModuleDependencyCollector.cpp index 939e611..b88cb60 100644 --- a/clang/lib/Frontend/ModuleDependencyCollector.cpp +++ b/clang/lib/Frontend/ModuleDependencyCollector.cpp @@ -55,7 +55,8 @@ struct ModuleDependencyPPCallbacks : public PPCallbacks { StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module 
*SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { if (!File) return; diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp index 62373b2..9b0ef30 100644 --- a/clang/lib/Frontend/PrecompiledPreamble.cpp +++ b/clang/lib/Frontend/PrecompiledPreamble.cpp @@ -98,7 +98,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { // File is std::nullopt if it wasn't found. // (We have some false negatives if PP recovered e.g. -> "foo") diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp index 7f5f669..a26d2c3 100644 --- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp @@ -153,7 +153,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void Ident(SourceLocation Loc, StringRef str) override; void PragmaMessage(SourceLocation Loc, StringRef Namespace, @@ -401,8 +402,8 @@ void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc, void PrintPPOutputPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { // In -dI mode, dump #include directives prior to dumping their content or // interpretation. Similar for -fkeep-system-includes. if (DumpIncludeDirectives || (KeepSystemIncludes && isSystem(FileType))) { @@ -418,14 +419,14 @@ void PrintPPOutputPPCallbacks::InclusionDirective( } // When preprocessing, turn implicit imports into module import pragmas. - if (Imported) { + if (ModuleImported) { switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) { case tok::pp_include: case tok::pp_import: case tok::pp_include_next: MoveToLine(HashLoc, /*RequireStartOfLine=*/true); *OS << "#pragma clang module import " - << Imported->getFullModuleName(true) + << SuggestedModule->getFullModuleName(true) << " /* clang -E: implicit import for " << "#" << PP.getSpelling(IncludeTok) << " " << (IsAngled ? '<' : '"') << FileName << (IsAngled ? 
'>' : '"') diff --git a/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp b/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp index b6b3746..1462058 100644 --- a/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp +++ b/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp @@ -75,7 +75,8 @@ private: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override; void If(SourceLocation Loc, SourceRange ConditionRange, ConditionValueKind ConditionValue) override; @@ -189,9 +190,10 @@ void InclusionRewriter::InclusionDirective( StringRef /*FileName*/, bool /*IsAngled*/, CharSourceRange /*FilenameRange*/, OptionalFileEntryRef /*File*/, StringRef /*SearchPath*/, StringRef /*RelativePath*/, - const Module *Imported, SrcMgr::CharacteristicKind FileType) { - if (Imported) { - auto P = ModuleIncludes.insert(std::make_pair(HashLoc, Imported)); + const Module *SuggestedModule, bool ModuleImported, + SrcMgr::CharacteristicKind FileType) { + if (ModuleImported) { + auto P = ModuleIncludes.insert(std::make_pair(HashLoc, SuggestedModule)); (void)P; assert(P.second && "Unexpected revisitation of the same include directive"); } else diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index a980f4b..97f9c0a 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2253,26 +2253,27 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // FIXME: We do not have a good way to disambiguate C++ clang modules from // C++ standard modules (other than use/non-use of Header Units). - Module *SM = SuggestedModule.getModule(); - bool MaybeTranslateInclude = - Action == Enter && File && SM && !SM->isForBuilding(getLangOpts()); + Module *ModuleToImport = SuggestedModule.getModule(); + + bool MaybeTranslateInclude = Action == Enter && File && ModuleToImport && + !ModuleToImport->isForBuilding(getLangOpts()); // Maybe a usable Header Unit bool UsableHeaderUnit = false; - if (getLangOpts().CPlusPlusModules && SM && SM->isHeaderUnit()) { + if (getLangOpts().CPlusPlusModules && ModuleToImport && + ModuleToImport->isHeaderUnit()) { if (TrackGMFState.inGMF() || IsImportDecl) UsableHeaderUnit = true; else if (!IsImportDecl) { // This is a Header Unit that we do not include-translate - SuggestedModule = ModuleMap::KnownHeader(); - SM = nullptr; + ModuleToImport = nullptr; } } // Maybe a usable clang header module. bool UsableClangHeaderModule = - (getLangOpts().CPlusPlusModules || getLangOpts().Modules) && SM && - !SM->isHeaderUnit(); + (getLangOpts().CPlusPlusModules || getLangOpts().Modules) && + ModuleToImport && !ModuleToImport->isHeaderUnit(); // Determine whether we should try to import the module for this #include, if // there is one. Don't do so if precompiled module support is disabled or we @@ -2282,12 +2283,11 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // unavailable, diagnose the situation and bail out. // FIXME: Remove this; loadModule does the same check (but produces // slightly worse diagnostics). 
- if (checkModuleIsAvailable(getLangOpts(), getTargetInfo(), - *SuggestedModule.getModule(), + if (checkModuleIsAvailable(getLangOpts(), getTargetInfo(), *ModuleToImport, getDiagnostics())) { Diag(FilenameTok.getLocation(), diag::note_implicit_top_level_module_import_here) - << SuggestedModule.getModule()->getTopLevelModuleName(); + << ModuleToImport->getTopLevelModuleName(); return {ImportAction::None}; } @@ -2295,7 +2295,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // FIXME: Should we have a second loadModule() overload to avoid this // extra lookup step? SmallVector, 2> Path; - for (Module *Mod = SM; Mod; Mod = Mod->Parent) + for (Module *Mod = ModuleToImport; Mod; Mod = Mod->Parent) Path.push_back(std::make_pair(getIdentifierInfo(Mod->Name), FilenameTok.getLocation())); std::reverse(Path.begin(), Path.end()); @@ -2306,12 +2306,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Load the module to import its macros. We'll make the declarations // visible when the parser gets here. - // FIXME: Pass SuggestedModule in here rather than converting it to a path - // and making the module loader convert it back again. + // FIXME: Pass SM in here rather than converting it to a path and making the + // module loader convert it back again. ModuleLoadResult Imported = TheModuleLoader.loadModule( IncludeTok.getLocation(), Path, Module::Hidden, /*IsInclusionDirective=*/true); - assert((Imported == nullptr || Imported == SuggestedModule.getModule()) && + assert((Imported == nullptr || Imported == SM) && "the imported module is different than the suggested one"); if (Imported) { @@ -2323,8 +2323,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // was in the directory of an umbrella header, for instance), but no // actual module containing it exists (because the umbrella header is // incomplete). Treat this as a textual inclusion. - SuggestedModule = ModuleMap::KnownHeader(); - SM = nullptr; + ModuleToImport = nullptr; } else if (Imported.isConfigMismatch()) { // On a configuration mismatch, enter the header textually. We still know // that it's part of the corresponding module. @@ -2365,7 +2364,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // this file will have no effect. if (Action == Enter && File && !HeaderInfo.ShouldEnterIncludeFile(*this, *File, EnterOnce, - getLangOpts().Modules, SM, + getLangOpts().Modules, ModuleToImport, IsFirstIncludeOfFile)) { // C++ standard modules: // If we are not in the GMF, then we textually include only @@ -2380,7 +2379,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( if (UsableHeaderUnit && !getLangOpts().CompilingPCH) Action = TrackGMFState.inGMF() ? Import : Skip; else - Action = (SuggestedModule && !getLangOpts().CompilingPCH) ? Import : Skip; + Action = (ModuleToImport && !getLangOpts().CompilingPCH) ? Import : Skip; } // Check for circular inclusion of the main file. @@ -2400,8 +2399,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // FIXME: Use a different callback for a pp-import? Callbacks->InclusionDirective(HashLoc, IncludeTok, LookupFilename, isAngled, FilenameRange, File, SearchPath, RelativePath, - Action == Import ? 
SuggestedModule.getModule() - : nullptr, + SuggestedModule.getModule(), Action == Import, FileCharacter); if (Action == Skip && File) Callbacks->FileSkipped(*File, FilenameTok, FileCharacter); @@ -2412,7 +2410,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // If this is a C++20 pp-import declaration, diagnose if we didn't find any // module corresponding to the named header. - if (IsImportDecl && !SuggestedModule) { + if (IsImportDecl && !ModuleToImport) { Diag(FilenameTok, diag::err_header_import_not_header_unit) << OriginalFilename << File->getName(); return {ImportAction::None}; @@ -2517,8 +2515,8 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( switch (Action) { case Skip: // If we don't need to enter the file, stop now. - if (SM) - return {ImportAction::SkippedModuleImport, SM}; + if (ModuleToImport) + return {ImportAction::SkippedModuleImport, ModuleToImport}; return {ImportAction::None}; case IncludeLimitReached: @@ -2530,13 +2528,13 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // If this is a module import, make it visible if needed. assert(SM && "no module to import"); - makeModuleVisible(SM, EndLoc); + makeModuleVisible(ModuleToImport, EndLoc); if (IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp___include_macros) return {ImportAction::None}; - return {ImportAction::ModuleImport, SM}; + return {ImportAction::ModuleImport, ModuleToImport}; } case Enter: @@ -2573,13 +2571,14 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Determine if we're switching to building a new submodule, and which one. // This does not apply for C++20 modules header units. - if (SM && !SM->isHeaderUnit()) { - if (SM->getTopLevelModule()->ShadowingModule) { + if (ModuleToImport && !ModuleToImport->isHeaderUnit()) { + if (ModuleToImport->getTopLevelModule()->ShadowingModule) { // We are building a submodule that belongs to a shadowed module. This // means we find header files in the shadowed module. - Diag(SM->DefinitionLoc, diag::err_module_build_shadowed_submodule) - << SM->getFullModuleName(); - Diag(SM->getTopLevelModule()->ShadowingModule->DefinitionLoc, + Diag(ModuleToImport->DefinitionLoc, + diag::err_module_build_shadowed_submodule) + << ModuleToImport->getFullModuleName(); + Diag(ModuleToImport->getTopLevelModule()->ShadowingModule->DefinitionLoc, diag::note_previous_definition); return {ImportAction::None}; } @@ -2591,21 +2590,22 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // that behaves the same as the header would behave in a compilation using // that PCH, which means we should enter the submodule. We need to teach // the AST serialization layer to deal with the resulting AST. - if (getLangOpts().CompilingPCH && SM->isForBuilding(getLangOpts())) + if (getLangOpts().CompilingPCH && + ModuleToImport->isForBuilding(getLangOpts())) return {ImportAction::None}; assert(!CurLexerSubmodule && "should not have marked this as a module yet"); - CurLexerSubmodule = SM; + CurLexerSubmodule = ModuleToImport; // Let the macro handling code know that any future macros are within // the new submodule. - EnterSubmodule(SM, EndLoc, /*ForPragma*/ false); + EnterSubmodule(ModuleToImport, EndLoc, /*ForPragma*/ false); // Let the parser know that any future declarations are within the new // submodule. // FIXME: There's no point doing this if we're handling a #__include_macros // directive. 
- return {ImportAction::ModuleBegin, SM}; + return {ImportAction::ModuleBegin, ModuleToImport}; } assert(!IsImportDecl && "failed to diagnose missing module for import decl"); diff --git a/clang/lib/Lex/PreprocessingRecord.cpp b/clang/lib/Lex/PreprocessingRecord.cpp index aab6a2b..be5aac7 100644 --- a/clang/lib/Lex/PreprocessingRecord.cpp +++ b/clang/lib/Lex/PreprocessingRecord.cpp @@ -472,8 +472,8 @@ void PreprocessingRecord::MacroUndefined(const Token &Id, void PreprocessingRecord::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { InclusionDirective::InclusionKind Kind = InclusionDirective::Include; switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) { @@ -506,10 +506,9 @@ void PreprocessingRecord::InclusionDirective( EndLoc = EndLoc.getLocWithOffset(-1); // the InclusionDirective expects // a token range. } - clang::InclusionDirective *ID = - new (*this) clang::InclusionDirective(*this, Kind, FileName, !IsAngled, - (bool)Imported, File, - SourceRange(HashLoc, EndLoc)); + clang::InclusionDirective *ID = new (*this) clang::InclusionDirective( + *this, Kind, FileName, !IsAngled, ModuleImported, File, + SourceRange(HashLoc, EndLoc)); addPreprocessedEntity(ID); } diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 995d8b2..5a9e563 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -430,14 +430,14 @@ void ModuleDepCollectorPP::LexedFileChanged(FileID FID, void ModuleDepCollectorPP::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, - StringRef SearchPath, StringRef RelativePath, const Module *Imported, - SrcMgr::CharacteristicKind FileType) { - if (!File && !Imported) { + StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) { + if (!File && !ModuleImported) { // This is a non-modular include that HeaderSearch failed to find. Add it // here as `FileChanged` will never see it. MDC.addFileDep(FileName); } - handleImport(Imported); + handleImport(SuggestedModule); } void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc, diff --git a/clang/tools/libclang/Indexing.cpp b/clang/tools/libclang/Indexing.cpp index 17d393e..05d88452 100644 --- a/clang/tools/libclang/Indexing.cpp +++ b/clang/tools/libclang/Indexing.cpp @@ -261,12 +261,13 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { bool isImport = (IncludeTok.is(tok::identifier) && IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp_import); DataConsumer.ppIncludedFile(HashLoc, FileName, File, isImport, IsAngled, - Imported); + ModuleImported); } /// MacroDefined - This hook is called whenever a macro definition is seen. 
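Across the files in this commit the new contract is uniform: the old `Imported` pointer is split into `SuggestedModule` (the module suggested for the header, if any) and `ModuleImported` (whether the inclusion was actually translated into an import), so clients can now distinguish "a module exists for this header" from "the include became an import". A minimal out-of-tree override of the updated hook might look like the following sketch; the `IncludeLogger` class and its output format are invented for illustration and are not part of the patch.

```cpp
#include "clang/Basic/Module.h"
#include "clang/Lex/PPCallbacks.h"
#include "llvm/Support/raw_ostream.h"

#include <string>

namespace {
// Hypothetical client of the updated hook: logs every inclusion directive,
// the module suggested for it (if any), and whether the include was
// actually translated into a module import.
class IncludeLogger : public clang::PPCallbacks {
  void InclusionDirective(clang::SourceLocation HashLoc,
                          const clang::Token &IncludeTok,
                          llvm::StringRef FileName, bool IsAngled,
                          clang::CharSourceRange FilenameRange,
                          clang::OptionalFileEntryRef File,
                          llvm::StringRef SearchPath,
                          llvm::StringRef RelativePath,
                          const clang::Module *SuggestedModule,
                          bool ModuleImported,
                          clang::SrcMgr::CharacteristicKind FileType) override {
    llvm::errs() << FileName << ": suggested module = "
                 << (SuggestedModule ? SuggestedModule->getFullModuleName()
                                     : std::string("(null)"))
                 << ", imported = " << (ModuleImported ? "true" : "false")
                 << "\n";
  }
};
} // namespace
```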
diff --git a/clang/unittests/Lex/PPCallbacksTest.cpp b/clang/unittests/Lex/PPCallbacksTest.cpp index e0a27b5..f3cdb1d 100644 --- a/clang/unittests/Lex/PPCallbacksTest.cpp +++ b/clang/unittests/Lex/PPCallbacksTest.cpp @@ -37,7 +37,8 @@ public: StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, StringRef SearchPath, - StringRef RelativePath, const Module *Imported, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, SrcMgr::CharacteristicKind FileType) override { this->HashLoc = HashLoc; this->IncludeTok = IncludeTok; @@ -47,7 +48,8 @@ public: this->File = File; this->SearchPath = SearchPath.str(); this->RelativePath = RelativePath.str(); - this->Imported = Imported; + this->SuggestedModule = SuggestedModule; + this->ModuleImported = ModuleImported; this->FileType = FileType; } @@ -59,7 +61,8 @@ public: OptionalFileEntryRef File; SmallString<16> SearchPath; SmallString<16> RelativePath; - const Module* Imported; + const Module *SuggestedModule; + bool ModuleImported; SrcMgr::CharacteristicKind FileType; }; -- cgit v1.1 From 13c14ad42c65e154dc079332dd5dd58e8925d26c Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Feb 2024 18:14:07 +0000 Subject: Revert "[DebugInfo][RemoveDIs] Turn on non-instrinsic debug-info by default" This reverts commit bdde5f9bea75e897bcc31a95b9c3376988c211cc. Two situations that are tripping a few buildbots: https://lab.llvm.org/buildbot/#/builders/205/builds/25126 Here, polly is currently presenting a DebugLoc attached to a debugging intrinsic as a "true" source location in a user report, something that's unreliable. https://lab.llvm.org/buildbot/#/builders/184/builds/10242 These HWAsan failures are probably (97% confidence) because in StackInfoBuilder::visit we're not observing DPValues attached to lifetime intrinsics because they're dealt with higher up the function. But it's late-o'clock here, so revert for now.
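For anyone tracking the flag itself: this revert only flips the default back, it does not remove the option. Assuming a standard tool that links LLVMCore and parses the usual command-line options (an assumption, not something this commit states), the new representation can presumably still be opted into explicitly with `opt -experimental-debuginfo-iterators=true -S input.ll` while the buildbot issues are investigated.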
--- llvm/lib/IR/BasicBlock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index bf02eba..fe9d0d0 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -34,7 +34,7 @@ cl::opt<bool> UseNewDbgInfoFormat("experimental-debuginfo-iterators", cl::desc("Enable communicating debuginfo positions " "through iterators, eliminating intrinsics"), - cl::init(true)); + cl::init(false)); DPMarker *BasicBlock::createMarker(Instruction *I) { assert(IsNewDbgInfoFormat && -- cgit v1.1 From 544f610d5310e1c1e7dd7a081d5a2a2607225740 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 8 Feb 2024 19:22:16 +0100 Subject: [libc++] Use __is_pointer_in_range inside vector::insert (#80624) --- libcxx/include/vector | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/include/vector b/libcxx/include/vector index 3934361..ce7df7a 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -351,6 +351,7 @@ template requires is-vector-bool-reference // Since C++ #include <__type_traits/type_identity.h> #include <__utility/exception_guard.h> #include <__utility/forward.h> +#include <__utility/is_pointer_in_range.h> #include <__utility/move.h> #include <__utility/pair.h> #include <__utility/swap.h> @@ -1580,14 +1581,13 @@ template <class _Tp, class _Allocator> _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x) { pointer __p = this->__begin_ + (__position - begin()); - // We can't compare unrelated pointers inside constant expressions - if (!__libcpp_is_constant_evaluated() && this->__end_ < this->__end_cap()) { + if (this->__end_ < this->__end_cap()) { if (__p == this->__end_) { __construct_one_at_end(__x); } else { __move_range(__p, this->__end_, __p + 1); const_pointer __xr = pointer_traits<const_pointer>::pointer_to(__x); - if (__p <= __xr && __xr < this->__end_) + if (std::__is_pointer_in_range(std::__to_address(__p), std::__to_address(__end_), std::addressof(__x))) ++__xr; *__p = *__xr; } -- cgit v1.1 From d272d944de9f0cb274752f77e97d4ceab2401ec5 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 8 Feb 2024 19:22:49 +0100 Subject: [libc++][NFC] Simplify the implementation of `numeric_limits` (#80425) The cv specializations for `numeric_limits` inherited privately for some reason. We can simplify the implementation by inheriting publicly and removing the members that just replicate the values from the base class.
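Because the specializations now inherit publicly, every member of `numeric_limits<_Tp>` remains visible through the cv-qualified specializations, which is why the redeclarations deleted below are pure boilerplate. A standalone sanity check of that equivalence (a sketch, not part of the patch):

```cpp
#include <limits>

// Public inheritance forwards every member, so the cv-qualified
// specializations behave exactly like the unqualified one.
static_assert(std::numeric_limits<const int>::is_specialized,
              "const specialization forwards to the base");
static_assert(std::numeric_limits<volatile double>::max_exponent ==
                  std::numeric_limits<double>::max_exponent,
              "members replicate the base values");
static_assert(std::numeric_limits<const volatile unsigned>::digits ==
                  std::numeric_limits<unsigned>::digits,
              "same for const volatile");

int main() { return 0; }
```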
--- libcxx/include/limits | 283 +------------------------------------------------- 1 file changed, 5 insertions(+), 278 deletions(-) diff --git a/libcxx/include/limits b/libcxx/include/limits index a240580..c704b4d 100644 --- a/libcxx/include/limits +++ b/libcxx/include/limits @@ -436,8 +436,8 @@ protected: }; template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<__remove_cv_t<_Tp> > { - typedef __libcpp_numeric_limits<__remove_cv_t<_Tp> > __base; +class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<_Tp> { + typedef __libcpp_numeric_limits<_Tp> __base; typedef typename __base::type type; public: @@ -530,286 +530,13 @@ template _LIBCPP_CONSTEXPR const float_round_style numeric_limits<_Tp>::round_style; template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private numeric_limits<_Tp> { - typedef numeric_limits<_Tp> __base; - typedef _Tp type; - -public: - static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); } - - static _LIBCPP_CONSTEXPR const int digits = __base::digits; - static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10; - static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10; - static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed; - static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer; - static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact; - static _LIBCPP_CONSTEXPR const int radix = __base::radix; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); } +class _LIBCPP_TEMPLATE_VIS numeric_limits : public numeric_limits<_Tp> {}; - static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10; - static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10; - - static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss; - _LIBCPP_SUPPRESS_DEPRECATED_POP - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559; - static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded; - static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo; - - static 
_LIBCPP_CONSTEXPR const bool traps = __base::traps; - static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before; - static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style; -}; - -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_specialized; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_digits10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_signed; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_integer; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_exact; -template -_LIBCPP_CONSTEXPR const int numeric_limits::radix; template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_infinity; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_quiet_NaN; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_signaling_NaN; -template -_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits::has_denorm; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_denorm_loss; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_iec559; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_bounded; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_modulo; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::traps; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::tinyness_before; -template -_LIBCPP_CONSTEXPR const float_round_style numeric_limits::round_style; - -template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private numeric_limits<_Tp> { - typedef numeric_limits<_Tp> __base; - typedef _Tp type; - -public: - static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); } - - static _LIBCPP_CONSTEXPR const int digits = __base::digits; - static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10; - static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10; - static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed; - static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer; - static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact; - static _LIBCPP_CONSTEXPR const int radix = __base::radix; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); } - - static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10; - static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10; - - static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN; - static _LIBCPP_CONSTEXPR const bool 
has_signaling_NaN = __base::has_signaling_NaN; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss; - _LIBCPP_SUPPRESS_DEPRECATED_POP - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); } +class _LIBCPP_TEMPLATE_VIS numeric_limits : public numeric_limits<_Tp> {}; - static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559; - static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded; - static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo; - - static _LIBCPP_CONSTEXPR const bool traps = __base::traps; - static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before; - static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style; -}; - -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_specialized; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_digits10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_signed; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_integer; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_exact; -template -_LIBCPP_CONSTEXPR const int numeric_limits::radix; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_infinity; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_quiet_NaN; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_signaling_NaN; -template -_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits::has_denorm; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_denorm_loss; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_iec559; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_bounded; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_modulo; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::traps; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::tinyness_before; -template -_LIBCPP_CONSTEXPR const float_round_style numeric_limits::round_style; - -template -class _LIBCPP_TEMPLATE_VIS numeric_limits : private numeric_limits<_Tp> { - typedef numeric_limits<_Tp> __base; - typedef _Tp type; - -public: - static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); } - - static _LIBCPP_CONSTEXPR const int digits = __base::digits; - static _LIBCPP_CONSTEXPR const int digits10 = 
__base::digits10; - static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10; - static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed; - static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer; - static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact; - static _LIBCPP_CONSTEXPR const int radix = __base::radix; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); } - - static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10; - static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10; - - static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss; - _LIBCPP_SUPPRESS_DEPRECATED_POP - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559; - static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded; - static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo; - - static _LIBCPP_CONSTEXPR const bool traps = __base::traps; - static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before; - static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style; -}; - -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_specialized; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits; -template -_LIBCPP_CONSTEXPR const int numeric_limits::digits10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_digits10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_signed; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_integer; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_exact; -template -_LIBCPP_CONSTEXPR const int numeric_limits::radix; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::min_exponent10; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent; -template -_LIBCPP_CONSTEXPR const int numeric_limits::max_exponent10; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_infinity; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_quiet_NaN; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_signaling_NaN; -template -_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits::has_denorm; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::has_denorm_loss; -template -_LIBCPP_CONSTEXPR const bool numeric_limits::is_iec559; 
-template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_bounded; -template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_modulo; -template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::traps; -template <class _Tp> -_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::tinyness_before; template <class _Tp> -_LIBCPP_CONSTEXPR const float_round_style numeric_limits<const volatile _Tp>::round_style; +class _LIBCPP_TEMPLATE_VIS numeric_limits<const volatile _Tp> : public numeric_limits<_Tp> {}; _LIBCPP_END_NAMESPACE_STD -- cgit v1.1 From 1b5f6916199ce09244cdb52c6911f2028e6ca95a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 8 Feb 2024 19:23:10 +0100 Subject: [libc++] Avoid including <cmath> in <compare> (#80418) This reduces the time to include `<compare>` from 84ms to 36ms. --- libcxx/include/__compare/strong_order.h | 23 +++++++++++++---------- libcxx/include/__compare/weak_order.h | 12 +++++++----- libcxx/include/compare | 1 + libcxx/test/libcxx/transitive_includes/cxx23.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx26.csv | 1 - 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/libcxx/include/__compare/strong_order.h b/libcxx/include/__compare/strong_order.h index 5f6ade5..3dc819e 100644 --- a/libcxx/include/__compare/strong_order.h +++ b/libcxx/include/__compare/strong_order.h @@ -13,11 +13,14 @@ #include <__compare/compare_three_way.h> #include <__compare/ordering.h> #include <__config> +#include <__math/exponential_functions.h> +#include <__math/traits.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/is_floating_point.h> +#include <__type_traits/is_same.h> #include <__utility/forward.h> #include <__utility/priority_tag.h> -#include <cmath> #include <cstdint> #include <limits> @@ -66,27 +69,27 @@ struct __fn { return strong_ordering::greater; } else if (__t == __u) { if constexpr (numeric_limits<_Dp>::radix == 2) { - return std::signbit(__u) <=> std::signbit(__t); + return __math::signbit(__u) <=> __math::signbit(__t); } else { // This is bullet 3 of the IEEE754 algorithm, relevant // only for decimal floating-point; // see https://stackoverflow.com/questions/69068075/ - if (__t == 0 || std::isinf(__t)) { - return std::signbit(__u) <=> std::signbit(__t); + if (__t == 0 || __math::isinf(__t)) { + return __math::signbit(__u) <=> __math::signbit(__t); } else { int __texp, __uexp; - (void)std::frexp(__t, &__texp); - (void)std::frexp(__u, &__uexp); + (void)__math::frexp(__t, &__texp); + (void)__math::frexp(__u, &__uexp); return (__t < 0) ? (__texp <=> __uexp) : (__uexp <=> __texp); } } } else { // They're unordered, so one of them must be a NAN. // The order is -QNAN, -SNAN, numbers, +SNAN, +QNAN.
- bool __t_is_nan = std::isnan(__t); - bool __u_is_nan = std::isnan(__u); - bool __t_is_negative = std::signbit(__t); - bool __u_is_negative = std::signbit(__u); + bool __t_is_nan = __math::isnan(__t); + bool __u_is_nan = __math::isnan(__u); + bool __t_is_negative = __math::signbit(__t); + bool __u_is_negative = __math::signbit(__u); using _IntType = conditional_t< sizeof(__t) == sizeof(int32_t), int32_t, diff --git a/libcxx/include/__compare/weak_order.h b/libcxx/include/__compare/weak_order.h index 9f719eb..b82a708 100644 --- a/libcxx/include/__compare/weak_order.h +++ b/libcxx/include/__compare/weak_order.h @@ -13,10 +13,12 @@ #include <__compare/ordering.h> #include <__compare/strong_order.h> #include <__config> +#include <__math/traits.h> #include <__type_traits/decay.h> +#include <__type_traits/is_floating_point.h> +#include <__type_traits/is_same.h> #include <__utility/forward.h> #include <__utility/priority_tag.h> -#include #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header @@ -51,10 +53,10 @@ struct __fn { return weak_ordering::greater; } else { // Otherwise, at least one of them is a NaN. - bool __t_is_nan = std::isnan(__t); - bool __u_is_nan = std::isnan(__u); - bool __t_is_negative = std::signbit(__t); - bool __u_is_negative = std::signbit(__u); + bool __t_is_nan = __math::isnan(__t); + bool __u_is_nan = __math::isnan(__u); + bool __t_is_negative = __math::signbit(__t); + bool __u_is_negative = __math::signbit(__u); if (__t_is_nan && __u_is_nan) { return (__u_is_negative <=> __t_is_negative); } else if (__t_is_nan) { diff --git a/libcxx/include/compare b/libcxx/include/compare index 626c743..cc0cae8 100644 --- a/libcxx/include/compare +++ b/libcxx/include/compare @@ -162,6 +162,7 @@ namespace std { #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include #endif diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 7c7099d..bd82411 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -105,7 +105,6 @@ codecvt string codecvt tuple codecvt typeinfo codecvt version -compare cmath compare cstddef compare cstdint compare limits diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 7c7099d..bd82411 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -105,7 +105,6 @@ codecvt string codecvt tuple codecvt typeinfo codecvt version -compare cmath compare cstddef compare cstdint compare limits -- cgit v1.1 From b92e0a31dab5917f31b4672430004add34b5e775 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 8 Feb 2024 10:23:20 -0800 Subject: [flang][cuda] Fix warning in switch --- flang/lib/Lower/ConvertVariable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index f761e14..d57bdd4 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1603,7 +1603,7 @@ fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute( break; case Fortran::common::CUDADataAttr::Texture: // Obsolete attribute - break; + return {}; } return fir::CUDAAttributeAttr::get(mlirContext, attr); -- cgit v1.1 From c0ff10814fb056369cd2bbf0e672498b4cc9c1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Thu, 8 Feb 2024 19:24:55 +0100 
Subject: docs/GettingStarted: document linker-related cmake options (#80932) Both LLVM_LINK_LLVM_DYLIB and LLVM_PARALLEL_LINK_JOBS help with some common gotchas. It seems worth documenting them here explicitly. Based on a review comment, also "refactor" the documentation to avoid duplication. --- llvm/docs/CMake.rst | 2 ++ llvm/docs/GettingStarted.rst | 86 +++++++++----------------------------------- 2 files changed, 19 insertions(+), 69 deletions(-) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 13d1912c..20f73c9 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -178,6 +178,8 @@ variable and type on the CMake command line: $ cmake -DVARIABLE:TYPE=value path/to/llvm/source +.. _cmake_frequently_used_variables: + Frequently-used CMake variables ------------------------------- diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst index 316fc6a..687d1f2 100644 --- a/llvm/docs/GettingStarted.rst +++ b/llvm/docs/GettingStarted.rst @@ -540,75 +540,23 @@ Variables are passed to ``cmake`` on the command line using the format ``-D=``. The following variables are some common options used by people developing LLVM. -+-------------------------+----------------------------------------------------+ -| Variable | Purpose | -+=========================+====================================================+ -| CMAKE_C_COMPILER | Tells ``cmake`` which C compiler to use. By | -| | default, this will be /usr/bin/cc. | -+-------------------------+----------------------------------------------------+ -| CMAKE_CXX_COMPILER | Tells ``cmake`` which C++ compiler to use. By | -| | default, this will be /usr/bin/c++. | -+-------------------------+----------------------------------------------------+ -| CMAKE_BUILD_TYPE | Tells ``cmake`` what type of build you are trying | -| | to generate files for. Valid options are Debug, | -| | Release, RelWithDebInfo, and MinSizeRel. Default | -| | is Debug. | -+-------------------------+----------------------------------------------------+ -| CMAKE_INSTALL_PREFIX | Specifies the install directory to target when | -| | running the install action of the build files. | -+-------------------------+----------------------------------------------------+ -| Python3_EXECUTABLE | Forces CMake to use a specific Python version by | -| | passing a path to a Python interpreter. By default | -| | the Python version of the interpreter in your PATH | -| | is used. | -+-------------------------+----------------------------------------------------+ -| LLVM_TARGETS_TO_BUILD | A semicolon delimited list controlling which | -| | targets will be built and linked into llvm. | -| | The default list is defined as | -| | ``LLVM_ALL_TARGETS``, and can be set to include | -| | out-of-tree targets. The default value includes: | -| | ``AArch64, AMDGPU, ARM, AVR, BPF, Hexagon, Lanai, | -| | Mips, MSP430, NVPTX, PowerPC, RISCV, Sparc, | -| | SystemZ, WebAssembly, X86, XCore``. Setting this | -| | to ``"host"`` will only compile the host | -| | architecture (e.g. equivalent to specifying ``X86``| -| | on an x86 host machine) can | -| | significantly speed up compile and test times. | -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source | -| | code This is disabled by default because it is | -| | slow and generates a lot of output. 
| -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_PROJECTS | A semicolon-delimited list selecting which of the | -| | other LLVM subprojects to additionally build. (Only| -| | effective when using a side-by-side project layout | -| | e.g. via git). The default list is empty. Can | -| | include: clang, clang-tools-extra, | -| | cross-project-tests, flang, libc, libclc, lld, | -| | lldb, mlir, openmp, polly, or pstl. | -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_RUNTIMES | A semicolon-delimited list selecting which of the | -| | runtimes to build. (Only effective when using the | -| | full monorepo layout). The default list is empty. | -| | Can include: compiler-rt, libc, libcxx, libcxxabi, | -| | libunwind, or openmp. | -+-------------------------+----------------------------------------------------+ -| LLVM_ENABLE_SPHINX | Build sphinx-based documentation from the source | -| | code. This is disabled by default because it is | -| | slow and generates a lot of output. Sphinx version | -| | 1.5 or later recommended. | -+-------------------------+----------------------------------------------------+ -| LLVM_BUILD_LLVM_DYLIB | Generate libLLVM.so. This library contains a | -| | default set of LLVM components that can be | -| | overridden with ``LLVM_DYLIB_COMPONENTS``. The | -| | default contains most of LLVM and is defined in | -| | ``tools/llvm-shlib/CMakelists.txt``. This option is| -| | not available on Windows. | -+-------------------------+----------------------------------------------------+ -| LLVM_OPTIMIZED_TABLEGEN | Builds a release tablegen that gets used during | -| | the LLVM build. This can dramatically speed up | -| | debug builds. | -+-------------------------+----------------------------------------------------+ +* ``CMAKE_C_COMPILER`` +* ``CMAKE_CXX_COMPILER`` +* ``CMAKE_BUILD_TYPE`` +* ``CMAKE_INSTALL_PREFIX`` +* ``Python3_EXECUTABLE`` +* ``LLVM_TARGETS_TO_BUILD`` +* ``LLVM_ENABLE_PROJECTS`` +* ``LLVM_ENABLE_RUNTIMES`` +* ``LLVM_ENABLE_DOXYGEN`` +* ``LLVM_ENABLE_SPHINX`` +* ``LLVM_BUILD_LLVM_DYLIB`` +* ``LLVM_LINK_LLVM_DYLIB`` +* ``LLVM_PARALLEL_LINK_JOBS`` +* ``LLVM_OPTIMIZED_TABLEGEN`` + +See :ref:`the list of frequently-used CMake variables ` +for more information. To configure LLVM, follow these steps: -- cgit v1.1 From 687304a018d36c4b0def4618a98fee6975172453 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 8 Feb 2024 10:26:06 -0800 Subject: [clang][lex] Fix build failure after da95d926 --- clang/lib/Lex/PPDirectives.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 97f9c0a..0b22139 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2306,12 +2306,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Load the module to import its macros. We'll make the declarations // visible when the parser gets here. - // FIXME: Pass SM in here rather than converting it to a path and making the - // module loader convert it back again. + // FIXME: Pass ModuleToImport in here rather than converting it to a path + // and making the module loader convert it back again. 
ModuleLoadResult Imported = TheModuleLoader.loadModule( IncludeTok.getLocation(), Path, Module::Hidden, /*IsInclusionDirective=*/true); - assert((Imported == nullptr || Imported == SM) && + assert((Imported == nullptr || Imported == ModuleToImport) && "the imported module is different than the suggested one"); if (Imported) { @@ -2526,7 +2526,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( case Import: { // If this is a module import, make it visible if needed. - assert(SM && "no module to import"); + assert(ModuleToImport && "no module to import"); makeModuleVisible(ModuleToImport, EndLoc); -- cgit v1.1 From ab4a793e8bc78f50f9f104c9c732e2dd91bf70a2 Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Thu, 8 Feb 2024 10:33:37 -0800 Subject: [lldb][debugger][NFC] Add broadcast bit for category-based progress events. (#81169) This commit adds a new broadcast bit to the debugger. When in use, it will be listened to for progress events that will be delivered and kept track of by category as opposed to the current behaviour of coming in one by one. --- lldb/include/lldb/API/SBDebugger.h | 1 + lldb/include/lldb/Core/Debugger.h | 1 + 2 files changed, 2 insertions(+) diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h index 218113a..62b2f91 100644 --- a/lldb/include/lldb/API/SBDebugger.h +++ b/lldb/include/lldb/API/SBDebugger.h @@ -46,6 +46,7 @@ public: eBroadcastBitProgress = (1 << 0), eBroadcastBitWarning = (1 << 1), eBroadcastBitError = (1 << 2), + eBroadcastBitProgressCategory = (1 << 3), }; SBDebugger(); diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index c6d603c..6ba90eb 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -84,6 +84,7 @@ public: eBroadcastBitWarning = (1 << 1), eBroadcastBitError = (1 << 2), eBroadcastSymbolChange = (1 << 3), + eBroadcastBitProgressCategory = (1 << 4), }; using DebuggerList = std::vector; -- cgit v1.1 From a1ed821b49d9a189c3a0a11228c0de517020feca Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Feb 2024 10:56:00 -0800 Subject: [TableGen] Simplify prepSkipToLineEnd for preprocessing The MemoryBuffer is created using `RequiresNullTerminator`, so we can safely skip the `CurPtr != CurBuf.end()` check. The redundant check causes a cppcheck report. In addition, elsewhere, including `*CurPtr == '#'` below, makes the null terminator assumption as well. Close #81120 --- llvm/lib/TableGen/TGLexer.cpp | 8 ++------ llvm/lib/TableGen/TGLexer.h | 5 ----- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 5456432..99d866a 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -849,7 +849,8 @@ bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { do { // Skip all symbols to the line end. - prepSkipToLineEnd(); + while (*CurPtr != '\n') + ++CurPtr; // Find the first non-whitespace symbol in the next line(s). 
if (!prepSkipLineBegin()) @@ -1032,11 +1033,6 @@ bool TGLexer::prepSkipDirectiveEnd() { return true; } -void TGLexer::prepSkipToLineEnd() { - while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) - ++CurPtr; -} - bool TGLexer::prepIsProcessingEnabled() { for (const PreprocessorControlDesc &I : llvm::reverse(*PrepIncludeStack.back())) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 25dcd9f..9adc03c 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -467,11 +467,6 @@ private: // directive. bool prepSkipDirectiveEnd(); - // Skip all symbols to the end of the line/file. - // The method adjusts CurPtr, so that it points to either new line - // symbol in the current line or the buffer end. - void prepSkipToLineEnd(); - // Return true, if the current preprocessor control stack is such that // we should allow lexer to process the next token, false - otherwise. // -- cgit v1.1 From a56fa161ab2617fa3aab3f91285fc757b6a8e09b Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Thu, 8 Feb 2024 13:59:47 -0500 Subject: [clang-tidy] Fix failing test after #80864 (#81171) The following test case in `clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp` is failing: ``` #ifdef PR64602 // Should not crash template struct S { auto foo(auto); }; template <> auto S<>::foo(auto) { return 1; } // CHECK8: error: template parameter list matching the non-templated nested type 'S<>' should be empty ('template<>') [clang-diagnostic-error] #endif ``` #80864 fixes a bug where we would (incorrectly) append invented template parameters to empty template parameter lists, which causes this test to fail. --- clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp index 547f634..d0efc5c 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp @@ -68,5 +68,6 @@ auto S<>::foo(auto) { return 1; } -// CHECK8: error: template parameter list matching the non-templated nested type 'S<>' should be empty ('template<>') [clang-diagnostic-error] +// CHECK8: error: conflicting types for 'foo' [clang-diagnostic-error] +// CHECK8: note: previous declaration is here #endif -- cgit v1.1 From 3d71e4166de81bc3b86d127d9ac6607bda2b2755 Mon Sep 17 00:00:00 2001 From: Jeremy Kun <2467754+j2kun@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:06:43 -0800 Subject: [docs]: Add a note about using custom types with diagnostics (#73818) --- mlir/docs/Diagnostics.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md index 9819843..82bc61d 100644 --- a/mlir/docs/Diagnostics.md +++ b/mlir/docs/Diagnostics.md @@ -119,6 +119,14 @@ op->emitError() << anotherOp; op->emitRemark() << anotherOp; ``` +To make a custom type compatible with Diagnostics, one must implement the +following friend function. + +```c++ +friend mlir::Diagnostic &operator<<( + mlir::Diagnostic &diagnostic, const MyType &foo); +``` + ### Attaching notes Unlike many other compiler frameworks, notes in MLIR cannot be emitted directly. 
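The note added to the MLIR docs above shows only the required declaration; a minimal sketch of a complete overload for some custom `MyType` (the type and its rendering are invented here for illustration) could be:

```c++
#include "mlir/IR/Diagnostics.h"
#include "llvm/ADT/StringRef.h"

struct MyType {
  llvm::StringRef name;
};

// Renders MyType into an in-flight diagnostic, enabling `diag << myValue`.
mlir::Diagnostic &operator<<(mlir::Diagnostic &diagnostic, const MyType &foo) {
  return diagnostic << "MyType(" << foo.name << ")";
}
```

With this in place, `op->emitError() << myTypeValue;` composes the same way as the built-in argument types; the friend form shown in the docs is only needed when the operator must reach private members.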
-- cgit v1.1 From 74fc16aaaa227b84e22706d2c5e376287f560b9e Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 8 Feb 2024 11:24:07 -0800 Subject: [lldb] Expand background symbol download (#80890) LLDB has a setting (symbols.enable-background-lookup) that calls dsymForUUID on a background thread for images as they appear in the current backtrace. Originally, the laziness of only looking up symbols for images in the backtrace existed to bring the number of dsymForUUID calls down to a manageable number. Users have requested the same functionality, but in a blocking fashion. This gives them the same user experience as enabling dsymForUUID globally, but without the massive upfront cost of having to download all the images, the majority of which they'll likely not need. This patch renames the setting to have a more generic name (symbols.auto-download) and changes its values from a boolean to an enum. Users can now specify "off", "background" and "foreground". The default remains "off" although I'll probably change that in the near future. --- lldb/include/lldb/Core/ModuleList.h | 23 ++++++++++++++++++++++- lldb/include/lldb/lldb-enumerations.h | 6 ++++++ lldb/source/Core/CoreProperties.td | 7 ++++++- lldb/source/Core/ModuleList.cpp | 13 +++++++++---- lldb/source/Host/common/Host.cpp | 2 ++ lldb/source/Symbol/SymbolLocator.cpp | 22 ++++++++++++++++------ 6 files changed, 61 insertions(+), 12 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index d78f7c5..43d931a 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -47,6 +47,26 @@ class UUID; class VariableList; struct ModuleFunctionSearchOptions; +static constexpr OptionEnumValueElement g_auto_download_enum_values[] = { + { + lldb::eSymbolDownloadOff, + "off", + "Disable automatically downloading symbols.", + }, + { + lldb::eSymbolDownloadBackground, + "background", + "Download symbols in the background for images as they appear in the " + "backtrace.", + }, + { + lldb::eSymbolDownloadForeground, + "foreground", + "Download symbols in the foreground for images as they appear in the " + "backtrace.", + }, +}; + class ModuleListProperties : public Properties { mutable llvm::sys::RWMutex m_symlink_paths_mutex; PathMappingList m_symlink_paths; @@ -60,7 +80,6 @@ public: bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); - bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -71,6 +90,8 @@ public: bool GetLoadSymbolOnDemand(); + lldb::SymbolDownload GetSymbolAutoDownload() const; + PathMappingList GetSymlinkMappings() const; }; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 7e9b538..4640533 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1314,6 +1314,12 @@ enum class ChildCacheState { ///< re-use what we computed the last time we called Update.
}; +enum SymbolDownload { + eSymbolDownloadOff = 0, + eSymbolDownloadBackground = 1, + eSymbolDownloadForeground = 2, +}; + } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 8d81967..9c4aa2d 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -8,7 +8,12 @@ let Definition = "modulelist" in { def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, Global, DefaultFalse, - Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; + Desc<"Alias for backward compatibility: when enabled this is equivalent to 'symbols.auto-download background'.">; + def AutoDownload: Property<"auto-download", "Enum">, + Global, + DefaultEnumValue<"eSymbolDownloadOff">, + EnumValues<"OptionEnumValues(g_auto_download_enum_values)">, + Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index b7f3936..b03490b 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -104,10 +104,15 @@ bool ModuleListProperties::SetEnableExternalLookup(bool new_value) { return SetPropertyAtIndex(ePropertyEnableExternalLookup, new_value); } -bool ModuleListProperties::GetEnableBackgroundLookup() const { - const uint32_t idx = ePropertyEnableBackgroundLookup; - return GetPropertyAtIndexAs<bool>( - idx, g_modulelist_properties[idx].default_uint_value != 0); +SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const { + // Backward compatibility alias.
+ if (GetPropertyAtIndexAs<bool>(ePropertyEnableBackgroundLookup, false)) + return eSymbolDownloadBackground; + + const uint32_t idx = ePropertyAutoDownload; + return GetPropertyAtIndexAs<lldb::SymbolDownload>( + idx, static_cast<lldb::SymbolDownload>( + g_modulelist_properties[idx].default_uint_value)); } FileSpec ModuleListProperties::GetClangModulesCachePath() const { diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index f4cec97..b72ba7e 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -550,6 +550,8 @@ llvm::Error Host::OpenFileInExternalEditor(llvm::StringRef editor, } bool Host::IsInteractiveGraphicSession() { return false; } + +bool Host::IsNetworkLimited() { return false; } #endif std::unique_ptr<Connection> Host::CreateDefaultConnection(llvm::StringRef url) { diff --git a/lldb/source/Symbol/SymbolLocator.cpp b/lldb/source/Symbol/SymbolLocator.cpp index 918f13ed..93a5bc4 100644 --- a/lldb/source/Symbol/SymbolLocator.cpp +++ b/lldb/source/Symbol/SymbolLocator.cpp @@ -10,6 +10,7 @@ #include "lldb/Core/Debugger.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Host/Host.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/ThreadPool.h" @@ -18,12 +19,10 @@ using namespace lldb; using namespace lldb_private; void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { - if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) - return; - static llvm::SmallSet<UUID, 8> g_seen_uuids; static std::mutex g_mutex; - Debugger::GetThreadPool().async([=]() { + + auto lookup = [=]() { { std::lock_guard<std::mutex> guard(g_mutex); if (g_seen_uuids.count(uuid)) @@ -36,12 +35,23 @@ void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { module_spec.GetUUID() = uuid; if (!PluginManager::DownloadObjectAndSymbolFile(module_spec, error, /*force_lookup=*/true, - /*copy_executable=*/false)) + /*copy_executable=*/true)) return; if (error.Fail()) return; Debugger::ReportSymbolChange(module_spec); - }); + }; + + switch (ModuleList::GetGlobalModuleListProperties().GetSymbolAutoDownload()) { + case eSymbolDownloadOff: + break; + case eSymbolDownloadBackground: + Debugger::GetThreadPool().async(lookup); + break; + case eSymbolDownloadForeground: + lookup(); + break; + }; } -- cgit v1.1 From 88e52511ca71165f1ff3d7c42229aeacb2c16db3 Mon Sep 17 00:00:00 2001 From: alex-t Date: Thu, 8 Feb 2024 20:27:36 +0100 Subject: [AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init (#79586) This change implements synthesizing the private buffer resource descriptor in the kernel prolog instead of using the preloaded kernel argument.
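To make the effect concrete, here is a representative before/after of a kernel prologue, excerpted from the updated tests below (the exact constant written to s3 is target-dependent):

```
; before: add the scratch wave offset into the preloaded descriptor
s_add_u32 s0, s0, s7
s_addc_u32 s1, s1, 0

; after: synthesize the descriptor from flat_scratch plus known constants
s_mov_b32 s2, -1
s_mov_b32 s3, 0xe00000
s_mov_b64 s[0:1], flat_scratch
```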
--- llvm/docs/AMDGPUUsage.rst | 10 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 108 ++++--- llvm/lib/Target/AMDGPU/SIFrameLowering.h | 14 +- .../AMDGPU/GlobalISel/call-outgoing-stack-args.ll | 10 +- .../abi-attribute-hints-undefined-behavior.ll | 18 +- .../blender-no-live-segment-at-def-implicit-def.ll | 5 +- .../AMDGPU/branch-folding-implicit-def-subreg.ll | 7 +- llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 329 ++++++++++++--------- llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll | 30 +- llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 29 +- .../CodeGen/AMDGPU/callee-special-input-vgprs.ll | 6 +- llvm/test/CodeGen/AMDGPU/cc-update.ll | 84 +++--- .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 10 +- .../CodeGen/AMDGPU/indirect-call-known-callees.ll | 9 +- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 20 +- .../AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll | 5 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 60 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 5 +- .../CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll | 15 +- .../CodeGen/AMDGPU/lower-module-lds-via-table.ll | 15 +- ...machine-sink-temporal-divergence-swdev407790.ll | 14 +- .../CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll | 15 +- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 7 +- .../CodeGen/AMDGPU/tuple-allocation-failure.ll | 14 +- llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 5 +- 25 files changed, 494 insertions(+), 350 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6b24171..3019968 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5530,9 +5530,13 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies Instead the flat SCRATCH instructions are used. Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs -that are used as a V# to access scratch. CP uses the value provided by the -runtime. It is used, together with Scratch Wavefront Offset as an offset, to -access the private memory space using a segment address. See +that are used as a V# to access scratch. +The compiler synthesizes the initialization value for the Private Segment +Buffer in the kernel prologue, using the Flat Scratch Init to initialize the +low 64 bits and a known constant for the high ones. If the Flat Scratch Init is not +available, CP uses the value provided by the runtime. It is used, together with +Scratch Wavefront Offset as an offset, to access the private memory space using +a segment address. See :ref:`amdgpu-amdhsa-initial-kernel-execution-state`.
The scratch V# is a four-aligned SGPR and always selected for the kernel as diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index d02aee7..6327a81 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -379,7 +379,8 @@ public: } // namespace llvm // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` -void SIFrameLowering::emitEntryFunctionFlatScratchInit( +// and return the FlatScratchInit Register used +Register SIFrameLowering::emitEntryFunctionFlatScratchInit( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -399,6 +400,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( Register FlatScrInitLo; Register FlatScrInitHi; + Register FlatScratchInitReg; if (ST.isAmdPalOS()) { // Extract the scratch offset from the descriptor in the GIT @@ -408,7 +410,6 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( // Find unused reg to load flat scratch init into MachineRegisterInfo &MRI = MF.getRegInfo(); - Register FlatScrInit = AMDGPU::NoRegister; ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; AllSGPR64s = AllSGPR64s.slice( @@ -417,16 +418,28 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( for (MCPhysReg Reg : AllSGPR64s) { if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) && MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { - FlatScrInit = Reg; + FlatScratchInitReg = Reg; break; } } - assert(FlatScrInit && "Failed to find free register for scratch init"); - FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); - FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); + } else { + FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + } + + assert(FlatScratchInitReg && "Failed to find free register for scratch init"); + + FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + + if (ST.isAmdPalOS()) { - buildGitPtr(MBB, I, DL, TII, FlatScrInit); + buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg); // We now have the GIT ptr - now get the scratch descriptor from the entry // at offset 0 (or offset 16 for a compute shader). @@ -441,8 +454,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); - BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) - .addReg(FlatScrInit) + BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg) + .addReg(FlatScratchInitReg) .addImm(EncodedOffset) // offset .addImm(0) // cpol .addMemOperand(MMO); @@ -450,20 +463,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( // Mask the offset in [47:0] of the descriptor const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) - .addReg(FlatScrInitHi) - .addImm(0xffff); + .addReg(FlatScrInitHi) + .addImm(0xffff); And->getOperand(3).setIsDead(); // Mark SCC as dead.
- } else { - Register FlatScratchInitReg = - MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); - assert(FlatScratchInitReg); - - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(FlatScratchInitReg); - MBB.addLiveIn(FlatScratchInitReg); - - FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); } // Do a 64-bit pointer add. @@ -486,20 +488,21 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( addReg(FlatScrInitHi). addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); - return; + return FlatScratchInitReg; } - // For GFX9. + assert(ST.getGeneration() == AMDGPUSubtarget::GFX9); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitLo) - .addReg(ScratchWaveOffsetReg); + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) .addReg(FlatScrInitHi) .addImm(0); Addc->getOperand(3).setIsDead(); // Mark SCC as dead. - return; + return AMDGPU::FLAT_SCR; } assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); @@ -520,6 +523,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( .addReg(FlatScrInitLo, RegState::Kill) .addImm(8); LShr->getOperand(3).setIsDead(); // Mark SCC as dead. + return AMDGPU::FLAT_SCR; } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not @@ -611,11 +615,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const Function &F = MF.getFunction(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); assert(MFI->isEntryFunction()); + bool NeedsFlatScratchInit = + MFI->getUserSGPRInfo().hasFlatScratchInit() && + (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || + (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); @@ -641,7 +649,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Now that we have fixed the reserved SRSRC we need to locate the // (potentially) preloaded SRSRC. 
Register PreloadedScratchRsrcReg; - if (ST.isAmdHsaOrMesa(F)) { + if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) { PreloadedScratchRsrcReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); if (ScratchRsrcReg && PreloadedScratchRsrcReg) { @@ -697,33 +705,30 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } - bool NeedsFlatScratchInit = - MFI->getUserSGPRInfo().hasFlatScratchInit() && - (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || - (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); - if ((NeedsFlatScratchInit || ScratchRsrcReg) && PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } + Register FlatScratchInit; if (NeedsFlatScratchInit) { - emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); + FlatScratchInit = + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); } if (ScratchRsrcReg) { - emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, - PreloadedScratchRsrcReg, - ScratchRsrcReg, ScratchWaveOffsetReg); + emitEntryFunctionScratchRsrcRegSetup( + MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg, + PreloadedScratchRsrcReg, ScratchWaveOffsetReg); } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, Register PreloadedScratchRsrcReg, - Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { + const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg, + Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -771,7 +776,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(21) .addReg(Rsrc03); } - } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { + } else if (ST.isMesaGfxShader(Fn) || + (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -830,6 +836,26 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(Rsrc23 >> 32) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else if (ST.isAmdHsaOrMesa(Fn)) { + + if (FlatScratchInit) { + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), + TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1)) + .addReg(FlatScratchInit) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + BuildMI(MBB, I, DL, SMovB32, Lo_32) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Hi_32) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + return; + } + assert(PreloadedScratchRsrcReg); if (ScratchRsrcReg != PreloadedScratchRsrcReg) { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index b3feb75..f706d48 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -67,19 +67,19 @@ public:
MachineBasicBlock::iterator MI) const override; private: - void emitEntryFunctionFlatScratchInit(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const DebugLoc &DL, - Register ScratchWaveOffsetReg) const; + Register + emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register ScratchWaveOffsetReg) const; Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const; void emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, - Register ScratchWaveOffsetReg) const; + Register FlatScratchInit, Register ScratchRsrcReg, + Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index e597ce6..6e49a5a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -13,10 +13,11 @@ define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; MUBUF-NEXT: s_mov_b32 s32, 0 -; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 @@ -61,9 +62,10 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s7 -; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 +; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index a439c0f..609b5e6 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -48,19 +48,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-SDAG: ; %bb.0: ; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-SDAG-NEXT: s_mov_b32 s2, -1 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9 +; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-SDAG-NEXT: s_mov_b32 s3, 0x11e80000 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch ; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2 ; 
FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 @@ -70,19 +71,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-GISEL: ; %bb.0: ; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9 +; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch ; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index 7c8d40c..74c6bb5 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -10,8 +10,9 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 5a128c7..c06f213 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -5,13 +5,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-LABEL: name: f1 ; GFX90A: bb.0.bb: ; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0 ; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc ; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, 
implicit-def dead $scc, implicit $scc - ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 87e17a1..381fb98 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -129,12 +129,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i1_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 @@ -234,8 +235,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 @@ -339,8 +341,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 @@ -422,12 +425,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i8_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, 
external_void_func_i8@rel32@hi+12 @@ -525,8 +529,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 @@ -625,8 +630,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 @@ -707,12 +713,13 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 @@ -809,8 +816,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 @@ -909,8 +917,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 @@ -991,12 +1000,13 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, 
external_void_func_i32@rel32@hi+12 @@ -1078,13 +1088,14 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 @@ -1182,12 +1193,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 @@ -1278,15 +1290,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 @@ -1391,12 +1404,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: s_mov_b32 s32, 0 @@ -1514,12 +1528,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: 
v_mov_b32_e32 v6, 3 @@ -1605,12 +1620,13 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 @@ -1689,12 +1705,13 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 @@ -1776,13 +1793,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 @@ -1868,14 +1886,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 @@ -1968,16 +1987,17 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: 
v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: v_mov_b32_e32 v3, -1.0 ; HSA-NEXT: v_mov_b32_e32 v4, 0.5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 @@ -2059,13 +2079,14 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 @@ -2154,15 +2175,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 ; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 @@ -2258,9 +2280,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -2268,7 +2292,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v4, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 @@ -2357,14 +2380,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-LABEL: test_call_external_void_func_v2i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; 
HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 @@ -2456,14 +2480,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2556,14 +2581,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2647,13 +2673,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 3 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2737,13 +2764,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2835,14 
+2863,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -2928,13 +2957,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -3025,14 +3055,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-LABEL: test_call_external_void_func_v2f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 @@ -3120,14 +3151,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3210,13 +3242,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; 
HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3302,14 +3335,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 @@ -3398,15 +3432,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: v_mov_b32_e32 v3, 6 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 @@ -3493,14 +3528,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 s2, -1 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3590,15 +3626,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: 
v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3691,16 +3728,17 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: v_mov_b32_e32 v4, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 @@ -3803,13 +3841,14 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -3915,9 +3954,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v8i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 @@ -3927,7 +3968,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v6, 7 ; HSA-NEXT: v_mov_b32_e32 v7, 8 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -4038,7 +4078,6 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4046,7 +4085,9 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 ; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; 
HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -4183,7 +4224,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4195,8 +4235,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 @@ -4359,9 +4401,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -4466,14 +4509,15 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; HSA-LABEL: test_call_external_i32_func_i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_mov_b32 s39, 0x1100f000 ; HSA-NEXT: s_mov_b32 s38, -1 ; HSA-NEXT: s_getpc_b64 s[4:5] @@ -4581,13 +4625,14 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 ; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -4702,9 +4747,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4712,7 +4759,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; HSA-NEXT: s_movk_i32 s32, 0x400 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4877,9 +4923,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4887,7 +4935,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; HSA-NEXT: s_movk_i32 s32, 0x800 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -5085,12 +5132,13 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 @@ -5339,14 +5387,15 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64: ; HSA: ; %bb.0: ; %entry ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80 ; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s2, -1 +; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_addc_u32 s1, s1, 0 
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: v_mov_b32_e32 v0, s23 ; HSA-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index c62a082..8e2fca5 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -11,10 +11,11 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -31,9 +32,10 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -50,9 +52,10 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -69,9 +72,10 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -88,9 +92,10 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -108,9 +113,10 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; 
CHECK-NEXT: v_mov_b32_e32 v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 616e5f0..6db5eff 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -7,12 +7,13 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 @@ -30,10 +31,11 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v0, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -52,11 +54,12 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -74,11 +77,12 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) # define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -99,12 +103,13 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: 
s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 6d603ef..49bf48a 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 +; FIXEDABI: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 +; FIXEDABI: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { ; FIXEDABI-NOT: v2 ; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2 ; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0 +; FIXEDABI: v_or_b32_e32 v31, v1, v0 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index ca09163b2..42beb1c 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -68,13 +68,14 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -88,11 +89,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -112,11 +114,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX1010-NEXT: s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -148,13 +151,14 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -171,11 +175,12 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -199,10 +204,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -311,13 +317,14 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -332,11 +339,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX900-LABEL: test_force_fp_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, 
s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -358,11 +366,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -413,14 +422,15 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX803-LABEL: test_force_fp_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s2, -1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -437,12 +447,13 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX900-LABEL: test_force_fp_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_mov_b32 s3, 0xe00000 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -467,10 +478,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: s_mov_b32 s2, -1 +; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 +; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 
11871db..68c632a 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -180,8 +180,9 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 @@ -229,8 +230,9 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 47110d9..2d019ef 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -13,8 +13,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s7 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -25,14 +23,17 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-NEXT: s_cmp_eq_u32 s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX9-NEXT: s_cselect_b32 s5, s13, s11 ; GFX9-NEXT: s_cselect_b32 s4, s12, s10 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 408199b..a66ed93 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -12,8 +12,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -37,8 +38,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_add_u32 s0, s0, s17 -; GISEL-NEXT: 
s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 +; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] @@ -67,8 +69,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -93,8 +96,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_add_u32 s0, s0, s17 -; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 +; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 6e90554..8843efd 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -11,8 +11,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b32 s33, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 66f31bb..4851c4f 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -118,10 +118,11 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -177,10 +178,11 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, 
use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -236,10 +238,11 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -295,10 +298,11 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -341,8 +345,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -351,6 +353,9 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 0 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -370,14 +375,15 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -410,8 +416,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -420,6 +424,9 @@ define amdgpu_kernel void 
@module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 2 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -439,14 +446,15 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -479,8 +487,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -489,6 +495,9 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 1 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -508,14 +517,15 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -548,8 +558,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -558,6 +566,9 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; 
CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 3 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -577,14 +588,15 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 61818da..26271a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -45,8 +45,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index bb7c43f..f780188 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -164,8 +164,9 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -198,8 +199,9 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -240,8 +242,9 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git 
a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index 4d73436..fa4b93f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -229,8 +229,9 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -268,8 +269,9 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -310,8 +312,9 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 +; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 138a6a8..e17f311 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -44,17 +44,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s35, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 @@ -781,17 +782,18 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10 -; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] -; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s36, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: 
s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s37, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index f70441e..70a9bbb 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -69,8 +69,9 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -128,8 +129,9 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -240,8 +242,9 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e7c5aaf..e6d9c0d 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: s_mul_i32 s4, s4, s5 @@ -55,8 +55,9 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch ; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: ds_write_b64 v0, v[3:4] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 1118cc3..8d8459f 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ 
b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -45,10 +45,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) @@ -73,7 +71,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 +; GLOBALNESS1-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 +; GLOBALNESS1-NEXT: s_mov_b32 s3, 0xe00000 +; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS1-NEXT: s_mov_b32 s68, s14 ; GLOBALNESS1-NEXT: s_mov_b32 s69, s13 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s12 @@ -332,10 +333,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) @@ -360,7 +359,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 +; GLOBALNESS0-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 +; GLOBALNESS0-NEXT: s_mov_b32 s3, 0xe00000 +; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS0-NEXT: s_mov_b32 s66, s14 ; GLOBALNESS0-NEXT: s_mov_b32 s67, s13 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s12 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 7840559..7d759089 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -14,8 +14,9 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 +; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 -- cgit v1.1 From d0f72f88606b78447fb7b61214651854c787c26f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 8 Feb 2024 11:28:06 -0800 Subject: [RISCV] Consider truncate semantics in performBUILD_VECTORCombine (#81168) Fixes 
https://github.com/llvm/llvm-project/issues/80910. Per the documentation in ISDOpcodes.h, for BUILD_VECTOR "The types of the operands must match the vector element type, except that integer types are allowed to be larger than the element type, in which case the operands are implicitly truncated." This transform was assuming that the scalar operand type matched the result type. This resulted in essentially performing a truncate before a binop, instead of after. As demonstrated by the test case changes, this is often not legal. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 + .../RISCV/rvv/fixed-vectors-buildvec-of-binop.ll | 6 +- .../CodeGen/RISCV/rvv/fixed-vectors-vselect.ll | 513 +++++++++++++++------ .../test/CodeGen/RISCV/urem-seteq-illegal-types.ll | 53 +-- 4 files changed, 399 insertions(+), 179 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 27037f4..0799cc2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14956,6 +14956,11 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, if (!TLI.isOperationLegalOrCustom(Opcode, VT) || !TLI.isTypeLegal(VT)) return SDValue(); + // This BUILD_VECTOR involves an implicit truncation, and sinking + // truncates through binops is non-trivial. + if (N->op_begin()->getValueType() != VT.getVectorElementType()) + return SDValue(); + SmallVector LHSOps; SmallVector RHSOps; for (SDValue Op : N->ops()) { @@ -14983,6 +14988,7 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // have different LHS and RHS types. if (Op.getOperand(0).getValueType() != Op.getOperand(1).getValueType()) return SDValue(); + RHSOps.push_back(Op.getOperand(1)); } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll index e376688..af7d7f7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll @@ -589,7 +589,8 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b, ret <8 x i32> %v3 } -; FIXME: This is currently showing a miscompile, we effectively +; Here we can not pull the ashr through into the vector domain due to +; the truncate semantics of the build_vector. Doing so would ; truncate before the ashr instead of after it, so if %a or %b ; is e.g. UINT32_MAX+1 we get different result. 
define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) { @@ -608,10 +609,11 @@ define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) { ; ; RV64-LABEL: build_vec_of_trunc_op: ; RV64: # %bb.0: # %entry +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: srli a1, a1, 1 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 1 ; RV64-NEXT: ret entry: %conv11.i = ashr i64 %a, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index cd47720..ead41b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -3,30 +3,65 @@ ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64 define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vv_v6i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a2, 0(a2) -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a2, a2, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a2 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vv_v6i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: slli a1, a2, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV32-NEXT: vle32.v v8, (a0), v0.t +; RV32-NEXT: vse32.v v8, (a3) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vv_v6i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: slli a1, a2, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; 
RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64-NEXT: vle32.v v8, (a0), v0.t +; RV64-NEXT: vse32.v v8, (a3) +; RV64-NEXT: ret %va = load <6 x i32>, ptr %a %vb = load <6 x i32>, ptr %b %vcc = load <6 x i1>, ptr %cc @@ -36,31 +71,67 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vx_v6i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a2, 0(a2) -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a2, a2, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a2 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vx_v6i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: slli a1, a2, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a3) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vx_v6i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: slli a1, a2, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a3) +; RV64-NEXT: ret %vb = load <6 x i32>, ptr %b %ahead = insertelement <6 x i32> poison, i32 %a, i32 0 
%va = shufflevector <6 x i32> %ahead, <6 x i32> poison, <6 x i32> zeroinitializer @@ -71,31 +142,67 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vi_v6i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: srli a0, a1, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a1, a1, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a2) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vi_v6i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 27 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a2) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vi_v6i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: slli a0, a1, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 59 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a2) +; RV64-NEXT: ret %vb = load <6 x i32>, ptr %b %a = insertelement <6 x i32> poison, i32 -1, i32 0 %va = shufflevector <6 x i32> %a, <6 x i32> poison, <6 x i32> zeroinitializer @@ -107,30 +214,65 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vv_v6f32: 
-; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a2, 0(a2) -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a1, a2, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: srli a2, a2, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a2 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vv_v6f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: slli a1, a2, 30 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a2, 27 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV32-NEXT: vle32.v v8, (a0), v0.t +; RV32-NEXT: vse32.v v8, (a3) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vv_v6f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: slli a1, a2, 62 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a2, 59 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64-NEXT: vle32.v v8, (a0), v0.t +; RV64-NEXT: vse32.v v8, (a3) +; RV64-NEXT: ret %va = load <6 x float>, ptr %a %vb = load <6 x float>, ptr %b %vcc = load <6 x i1>, ptr %cc @@ -140,31 +282,67 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vx_v6f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: srli a0, a1, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 4 -; CHECK-NEXT: 
vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a1, a1, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a2) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vx_v6f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 27 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vfmerge.vfm v8, v8, fa0, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a2) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vx_v6f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: slli a0, a1, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 59 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vfmerge.vfm v8, v8, fa0, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a2) +; RV64-NEXT: ret %vb = load <6 x float>, ptr %b %ahead = insertelement <6 x float> poison, float %a, i32 0 %va = shufflevector <6 x float> %ahead, <6 x float> poison, <6 x i32> zeroinitializer @@ -175,31 +353,67 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { } define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { -; CHECK-LABEL: vselect_vfpzero_v6f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: srli a0, a1, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 2 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 3 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a0, a1, 4 -; CHECK-NEXT: vslide1down.vx v10, v10, a0 -; CHECK-NEXT: srli a1, a1, 5 -; CHECK-NEXT: vslide1down.vx v10, v10, a1 -; CHECK-NEXT: vslidedown.vi v10, v10, 2 -; CHECK-NEXT: vand.vi v10, v10, 1 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; 
CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v8, (a2) -; CHECK-NEXT: ret +; RV32-LABEL: vselect_vfpzero_v6f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: slli a0, a1, 27 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a0 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslidedown.vi v10, v10, 2 +; RV32-NEXT: vand.vi v10, v10, 1 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v8, v8, 0, v0 +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a2) +; RV32-NEXT: ret +; +; RV64-LABEL: vselect_vfpzero_v6f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: slli a0, a1, 62 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: slli a0, a1, 59 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslidedown.vi v10, v10, 2 +; RV64-NEXT: vand.vi v10, v10, 1 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v8, v8, 0, v0 +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a2) +; RV64-NEXT: ret %vb = load <6 x float>, ptr %b %a = insertelement <6 x float> poison, float 0.0, i32 0 %va = shufflevector <6 x float> %a, <6 x float> poison, <6 x i32> zeroinitializer @@ -497,6 +711,3 @@ define <64 x i1> @vselect_v64i1(<64 x i1> %a, <64 x i1> %b, <64 x i1> %cc) { %v = select <64 x i1> %cc, <64 x i1> %a, <64 x i1> %b ret <64 x i1> %v } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; RV32: {{.*}} -; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index 4544cba..c016e8f 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -521,33 +521,35 @@ define void @test_urem_vec(ptr %X) nounwind { ; ; RV32MV-LABEL: test_urem_vec: ; RV32MV: # %bb.0: -; RV32MV-NEXT: lbu a1, 4(a0) -; RV32MV-NEXT: lw a2, 0(a0) -; RV32MV-NEXT: slli a1, a1, 10 -; RV32MV-NEXT: srli a3, a2, 22 -; RV32MV-NEXT: or a1, a3, a1 -; RV32MV-NEXT: srli a3, a2, 11 +; RV32MV-NEXT: lw a1, 0(a0) +; RV32MV-NEXT: andi a2, a1, 2047 ; RV32MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV32MV-NEXT: vmv.v.x v8, a2 +; RV32MV-NEXT: lbu a2, 4(a0) +; RV32MV-NEXT: slli a3, a1, 10 +; RV32MV-NEXT: srli a3, a3, 21 ; RV32MV-NEXT: vslide1down.vx v8, v8, a3 +; RV32MV-NEXT: slli a2, a2, 10 +; RV32MV-NEXT: srli a1, a1, 22 +; RV32MV-NEXT: or a1, a1, a2 +; RV32MV-NEXT: andi a1, a1, 2047 ; RV32MV-NEXT: vslide1down.vx v8, v8, a1 +; RV32MV-NEXT: lui a1, %hi(.LCPI4_0) +; RV32MV-NEXT: addi a1, a1, %lo(.LCPI4_0) +; RV32MV-NEXT: vle16.v v9, (a1) ; RV32MV-NEXT: vslidedown.vi v8, v8, 1 -; RV32MV-NEXT: li a1, 2047 -; RV32MV-NEXT: lui a2, %hi(.LCPI4_0) -; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_0) -; RV32MV-NEXT: vle16.v v9, (a2) -; RV32MV-NEXT: vand.vx v8, v8, a1 ; RV32MV-NEXT: vid.v v10 ; RV32MV-NEXT: vsub.vv v8, v8, v10 ; RV32MV-NEXT: vmul.vv v8, v8, v9 ; RV32MV-NEXT: vadd.vv v9, v8, v8 -; RV32MV-NEXT: lui a2, 41121 -; RV32MV-NEXT: addi a2, a2, -1527 +; RV32MV-NEXT: lui a1, 41121 +; RV32MV-NEXT: addi a1, a1, -1527 ; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32MV-NEXT: vmv.s.x v10, a2 +; RV32MV-NEXT: vmv.s.x v10, a1 ; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV32MV-NEXT: vsext.vf2 v11, v10 ; RV32MV-NEXT: vsll.vv v9, v9, v11 +; RV32MV-NEXT: li a1, 2047 ; RV32MV-NEXT: vand.vx v8, v8, a1 ; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32MV-NEXT: vmv.v.i v10, 1 @@ -585,31 +587,30 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: lwu a2, 0(a0) ; RV64MV-NEXT: slli a1, a1, 32 ; RV64MV-NEXT: or a1, a2, a1 +; RV64MV-NEXT: slli a2, a1, 42 +; RV64MV-NEXT: srli a2, a2, 53 +; RV64MV-NEXT: andi a3, a1, 2047 ; RV64MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64MV-NEXT: vmv.v.x v8, a1 -; RV64MV-NEXT: slli a1, a1, 24 -; RV64MV-NEXT: srli a1, a1, 24 -; RV64MV-NEXT: srli a2, a1, 11 +; RV64MV-NEXT: vmv.v.x v8, a3 ; RV64MV-NEXT: vslide1down.vx v8, v8, a2 ; RV64MV-NEXT: srli a1, a1, 22 ; RV64MV-NEXT: vslide1down.vx v8, v8, a1 +; RV64MV-NEXT: lui a1, %hi(.LCPI4_0) +; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_0) +; RV64MV-NEXT: vle16.v v9, (a1) ; RV64MV-NEXT: vslidedown.vi v8, v8, 1 -; RV64MV-NEXT: li a1, 2047 -; RV64MV-NEXT: lui a2, %hi(.LCPI4_0) -; RV64MV-NEXT: addi a2, a2, %lo(.LCPI4_0) -; RV64MV-NEXT: vle16.v v9, (a2) -; RV64MV-NEXT: vand.vx v8, v8, a1 ; RV64MV-NEXT: vid.v v10 ; RV64MV-NEXT: vsub.vv v8, v8, v10 ; RV64MV-NEXT: vmul.vv v8, v8, v9 ; RV64MV-NEXT: vadd.vv v9, v8, v8 -; RV64MV-NEXT: lui a2, 41121 -; RV64MV-NEXT: addi a2, a2, -1527 +; RV64MV-NEXT: lui a1, 41121 +; RV64MV-NEXT: addi a1, a1, -1527 ; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64MV-NEXT: vmv.s.x v10, a2 +; RV64MV-NEXT: vmv.s.x v10, a1 ; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64MV-NEXT: vsext.vf2 v11, v10 ; RV64MV-NEXT: vsll.vv v9, v9, v11 +; RV64MV-NEXT: li a1, 2047 ; RV64MV-NEXT: vand.vx v8, v8, a1 ; RV64MV-NEXT: vsetvli zero, zero, e32, 
m1, ta, ma
; RV64MV-NEXT:    vmv.v.i v10, 1
-- 
cgit v1.1 

From 35c4f025f9d4c398eff0c8e49a47a5c7067939ba Mon Sep 17 00:00:00 2001
From: Jacob Lambert
Date: Thu, 8 Feb 2024 11:35:04 -0800
Subject: [NFC][clang][Driver] Specify options for <arg> with -save-temps= (#80921)

---
 clang/include/clang/Driver/Options.td | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4b232b8..4f498db 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5392,7 +5392,9 @@ def regcall4 : Flag<["-"], "regcall4">, Group, MarshallingInfoFlag>;
 def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[NoXarchOption]>,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
-  HelpText<"Save intermediate compilation results.">;
+  HelpText<"Save intermediate compilation results. <arg> can be set to cwd for "
+  "current working directory, or obj which will save temporary files in the "
+  "same directory as the final output file">;
 def save_temps : Flag<["-", "--"], "save-temps">, Flags<[NoXarchOption]>,
   Visibility<[ClangOption, FlangOption, FC1Option]>,
   Alias<save_temps_EQ>, AliasArgs<["cwd"]>,
-- 
cgit v1.1 

From 3b57b647a9bb821137f91dfbc2172a9947f620cc Mon Sep 17 00:00:00 2001
From: Natalie Chouinard
Date: Thu, 8 Feb 2024 14:35:44 -0500
Subject: [HLSL][SPIR-V] Add create.handle intrinsic (#81038)

Add a SPIR-V target-specific intrinsic for creating handles, which is
used for lowering HLSL resource types like RWBuffer.

`llvm/lib/TargetParser/Triple.cpp`: SPIR-V intrinsics use "spv" as the
target prefix, not "spirv". As far as I can tell, this is the first one
that is used via the `CGBuiltin` codepath, which relies on
`getArchTypePrefix`, so I've corrected it here.

`clang/lib/Basic/Targets/SPIR.h`: When records are laid out in the
lowering from AST to IR, they were incorrectly offset because these
Pointer attributes were defaulting to 32.

Related to #81036
---
 clang/lib/Basic/Targets/SPIR.h                            | 1 +
 clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl | 4 ++++
 llvm/include/llvm/IR/IntrinsicsSPIRV.td                   | 4 ++++
 llvm/lib/IR/Function.cpp                                  | 1 +
 llvm/lib/TargetParser/Triple.cpp                          | 2 +-
 5 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index e6235f3..e25991e 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -310,6 +310,7 @@ public:
     assert(Triple.getEnvironment() >= llvm::Triple::Pixel &&
            Triple.getEnvironment() <= llvm::Triple::Amplification &&
            "Logical SPIR-V environment must be a valid shader stage.");
+    PointerWidth = PointerAlign = 64;
 
     // SPIR-V IDs are represented with a single 32-bit word.
    SizeType = TargetInfo::UnsignedInt;

diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
index 2b9c66d..74b3f59 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
 
 RWBuffer<float> Buf;
 
@@ -7,3 +8,6 @@ RWBuffer<float> Buf;
 
 // CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
 // CHECK: store ptr %[[HandleRes]], ptr %h, align 4
+
+// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8

diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index ea0074d..057dc64 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -38,4 +38,8 @@ let TargetPrefix = "spv" in {
   // Expect, Assume Intrinsics
   def int_spv_assume : Intrinsic<[], [llvm_i1_ty]>;
   def int_spv_expect : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
+
+  // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support.
+  def int_spv_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">,
+      Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
 }

diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 22e2455..fceffbc 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -44,6 +44,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/IntrinsicsS390.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/IR/IntrinsicsVE.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 3494ae5..96dbd5c 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -188,7 +188,7 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) {
 
   case spirv:
   case spirv32:
-  case spirv64:     return "spirv";
+  case spirv64:     return "spv";
 
   case kalimba:     return "kalimba";
   case lanai:       return "lanai";
-- 
cgit v1.1 

From e5924d64991abb4da111317ff5e8d9147265354a Mon Sep 17 00:00:00 2001
From: Yinying Li <107574043+yinying-lisa-li@users.noreply.github.com>
Date: Thu, 8 Feb 2024 19:38:42 +0000
Subject: [mlir][sparse] Implement parsing n out of m (#79935)

1. Add parsing methods for block[n, m].
2. Encode n and m with the newly extended 64-bit LevelType enum.
3. Update 2:4 method names/comments to n:m.
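As a minimal sketch of the new encoding (assuming only the constants this
patch introduces below: LevelFormat in bits 16-31, properties in bits 0-15,
n in bits 32-39 via nToBits, m in bits 40-47 via mToBits), a
structured[2, 4] level type packs and unpacks as follows; the variable
names are illustrative, not from the patch:

// Sketch of the 64-bit LevelType packing for structured[2, 4], i.e. 2:4.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t FormatNOutOfM = 0x000000100000; // LevelFormat::NOutOfM
  const uint64_t n = 2, m = 4;
  // nToBits(n) | mToBits(m) | format; no property bits set, so the
  // level is ordered and unique.
  const uint64_t lt = FormatNOutOfM | (n << 32) | (m << 40);
  assert(((lt >> 32) & 0xff) == 2); // what getN(lt) recovers
  assert(((lt >> 40) & 0xff) == 4); // what getM(lt) recovers
  assert((lt & 0xffff) == 0);       // ordered (bit 1 clear), unique (bit 0 clear)
  return 0;
}

Because the property bits stay in the low word, the pre-existing
ordered/unique predicates, which mask with 2 and 1, carry over unchanged.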
--- mlir/include/mlir-c/Dialect/SparseTensor.h | 28 +-- mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h | 205 +++++++++++++-------- .../SparseTensor/IR/SparseTensorAttrDefs.td | 5 +- .../Dialect/SparseTensor/IR/SparseTensorType.h | 2 +- .../mlir/Dialect/SparseTensor/Utils/Merger.h | 2 +- .../mlir/ExecutionEngine/SparseTensor/Storage.h | 20 +- mlir/lib/Bindings/Python/DialectSparseTensor.cpp | 2 +- mlir/lib/CAPI/Dialect/SparseTensor.cpp | 49 +++-- .../SparseTensor/IR/Detail/LvlTypeParser.cpp | 54 ++++-- .../Dialect/SparseTensor/IR/Detail/LvlTypeParser.h | 6 +- .../SparseTensor/IR/SparseTensorDialect.cpp | 16 +- .../SparseTensor/Transforms/SparseGPUCodegen.cpp | 2 +- .../Transforms/SparseTensorCodegen.cpp | 6 +- .../SparseTensor/Transforms/Sparsification.cpp | 2 +- .../Transforms/Utils/SparseTensorLevel.cpp | 19 +- mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp | 4 +- mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp | 2 +- mlir/test/CAPI/sparse_tensor.c | 6 +- .../Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir | 2 +- .../Dialect/SparseTensor/roundtrip_encoding.mlir | 30 ++- .../Dialect/SparseTensor/sparse_fill_zero.mlir | 2 +- .../SparseTensor/CPU/sparse_block_matmul.mlir | 2 +- .../Dialect/SparseTensor/CPU/sparse_ds.mlir | 2 +- .../GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir | 2 +- .../GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir | 2 +- mlir/test/python/dialects/sparse_tensor/dialect.py | 4 +- 26 files changed, 302 insertions(+), 174 deletions(-) diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h index 42d8400..2c71b00 100644 --- a/mlir/include/mlir-c/Dialect/SparseTensor.h +++ b/mlir/include/mlir-c/Dialect/SparseTensor.h @@ -28,20 +28,20 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(SparseTensor, sparse_tensor); typedef uint64_t MlirSparseTensorLevelType; enum MlirBaseSparseTensorLevelType { - MLIR_SPARSE_TENSOR_LEVEL_DENSE = 4, // 0b00001_00 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 8, // 0b00010_00 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU = 9, // 0b00010_01 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO = 10, // 0b00010_10 - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO = 11, // 0b00010_11 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 16, // 0b00100_00 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU = 17, // 0b00100_01 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO = 18, // 0b00100_10 - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO = 19, // 0b00100_11 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 32, // 0b01000_00 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU = 33, // 0b01000_01 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO = 34, // 0b01000_10 - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO = 35, // 0b01000_11 - MLIR_SPARSE_TENSOR_LEVEL_TWO_OUT_OF_FOUR = 64, // 0b10000_00 + MLIR_SPARSE_TENSOR_LEVEL_DENSE = 0x000000010000, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 0x000000020000, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU = 0x000000020001, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO = 0x000000020002, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO = 0x000000020003, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 0x000000040000, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU = 0x000000040001, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO = 0x000000040002, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO = 0x000000040003, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 0x000000080000, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU = 0x000000080001, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO = 0x000000080002, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO = 0x000000080003, + MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M 
= 0x000000100000, }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h index 86c52bf..e940d20 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h @@ -154,9 +154,10 @@ enum class Action : uint32_t { /// This enum defines all the sparse representations supportable by /// the SparseTensor dialect. We use a lightweight encoding to encode -/// both the "format" per se (dense, compressed, singleton, loose_compressed, -/// two-out-of-four) as well as the "properties" (ordered, unique). The -/// encoding is chosen for performance of the runtime library, and thus may +/// the "format" per se (dense, compressed, singleton, loose_compressed, +/// n-out-of-m), the "properties" (ordered, unique) as well as n and m when +/// the format is NOutOfM. +/// The encoding is chosen for performance of the runtime library, and thus may /// change in future versions; consequently, client code should use the /// predicate functions defined below, rather than relying on knowledge /// about the particular binary encoding. @@ -165,41 +166,75 @@ enum class Action : uint32_t { /// where we need to store an undefined or indeterminate `LevelType`. /// It should not be used externally, since it does not indicate an /// actual/representable format. +/// +/// Bit manipulations for LevelType: +/// +/// | 8-bit n | 8-bit m | 16-bit LevelFormat | 16-bit LevelProperty | +/// enum class LevelType : uint64_t { - Undef = 0, // 0b00000_00 - Dense = 4, // 0b00001_00 - Compressed = 8, // 0b00010_00 - CompressedNu = 9, // 0b00010_01 - CompressedNo = 10, // 0b00010_10 - CompressedNuNo = 11, // 0b00010_11 - Singleton = 16, // 0b00100_00 - SingletonNu = 17, // 0b00100_01 - SingletonNo = 18, // 0b00100_10 - SingletonNuNo = 19, // 0b00100_11 - LooseCompressed = 32, // 0b01000_00 - LooseCompressedNu = 33, // 0b01000_01 - LooseCompressedNo = 34, // 0b01000_10 - LooseCompressedNuNo = 35, // 0b01000_11 - TwoOutOfFour = 64, // 0b10000_00 + Undef = 0x000000000000, + Dense = 0x000000010000, + Compressed = 0x000000020000, + CompressedNu = 0x000000020001, + CompressedNo = 0x000000020002, + CompressedNuNo = 0x000000020003, + Singleton = 0x000000040000, + SingletonNu = 0x000000040001, + SingletonNo = 0x000000040002, + SingletonNuNo = 0x000000040003, + LooseCompressed = 0x000000080000, + LooseCompressedNu = 0x000000080001, + LooseCompressedNo = 0x000000080002, + LooseCompressedNuNo = 0x000000080003, + NOutOfM = 0x000000100000, }; /// This enum defines all supported storage format without the level properties. enum class LevelFormat : uint64_t { - Dense = 4, // 0b00001_00 - Compressed = 8, // 0b00010_00 - Singleton = 16, // 0b00100_00 - LooseCompressed = 32, // 0b01000_00 - TwoOutOfFour = 64, // 0b10000_00 + Dense = 0x00010000, + Compressed = 0x00020000, + Singleton = 0x00040000, + LooseCompressed = 0x00080000, + NOutOfM = 0x00100000, }; /// This enum defines all the nondefault properties for storage formats. enum class LevelPropertyNondefault : uint64_t { - Nonunique = 1, // 0b00000_01 - Nonordered = 2, // 0b00000_10 + Nonunique = 0x0001, + Nonordered = 0x0002, }; +/// Get N of NOutOfM level type. +constexpr uint64_t getN(LevelType lt) { + return (static_cast(lt) >> 32) & 0xff; +} + +/// Get M of NOutOfM level type. 
+constexpr uint64_t getM(LevelType lt) { + return (static_cast(lt) >> 40) & 0xff; +} + +/// Convert N of NOutOfM level type to the stored bits. +constexpr uint64_t nToBits(uint64_t n) { return n << 32; } + +/// Convert M of NOutOfM level type to the stored bits. +constexpr uint64_t mToBits(uint64_t m) { return m << 40; } + +/// Check if the `LevelType` is NOutOfM (regardless of +/// properties and block sizes). +constexpr bool isNOutOfMLT(LevelType lt) { + return ((static_cast(lt) & 0x100000) == + static_cast(LevelType::NOutOfM)); +} + +/// Check if the `LevelType` is NOutOfM with the correct block sizes. +constexpr bool isValidNOutOfMLT(LevelType lt, uint64_t n, uint64_t m) { + return isNOutOfMLT(lt) && getN(lt) == n && getM(lt) == m; +} + /// Returns string representation of the given dimension level type. -constexpr const char *toMLIRString(LevelType lt) { +constexpr const char *toMLIRString(LevelType lvlType) { + auto lt = static_cast(static_cast(lvlType) & 0xffffffff); switch (lt) { case LevelType::Undef: return "undef"; @@ -229,21 +264,22 @@ constexpr const char *toMLIRString(LevelType lt) { return "loose_compressed(nonordered)"; case LevelType::LooseCompressedNuNo: return "loose_compressed(nonunique, nonordered)"; - case LevelType::TwoOutOfFour: - return "block2_4"; + case LevelType::NOutOfM: + return "structured"; } return ""; } /// Check that the `LevelType` contains a valid (possibly undefined) value. constexpr bool isValidLT(LevelType lt) { - const uint64_t formatBits = static_cast(lt) >> 2; - const uint64_t propertyBits = static_cast(lt) & 3; - // If undefined or dense, then must be unique and ordered. + const uint64_t formatBits = static_cast(lt) & 0xffff0000; + const uint64_t propertyBits = static_cast(lt) & 0xffff; + // If undefined/dense/NOutOfM, then must be unique and ordered. // Otherwise, the format must be one of the known ones. - return (formatBits <= 1 || formatBits == 16) + return (formatBits <= 0x10000 || formatBits == 0x100000) ? (propertyBits == 0) - : (formatBits == 2 || formatBits == 4 || formatBits == 8); + : (formatBits == 0x20000 || formatBits == 0x40000 || + formatBits == 0x80000); } /// Check if the `LevelType` is the special undefined value. @@ -251,34 +287,28 @@ constexpr bool isUndefLT(LevelType lt) { return lt == LevelType::Undef; } /// Check if the `LevelType` is dense (regardless of properties). constexpr bool isDenseLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::Dense); } /// Check if the `LevelType` is compressed (regardless of properties). constexpr bool isCompressedLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::Compressed); } /// Check if the `LevelType` is singleton (regardless of properties). constexpr bool isSingletonLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::Singleton); } /// Check if the `LevelType` is loose compressed (regardless of properties). constexpr bool isLooseCompressedLT(LevelType lt) { - return (static_cast(lt) & ~3) == + return (static_cast(lt) & ~0xffff) == static_cast(LevelType::LooseCompressed); } -/// Check if the `LevelType` is 2OutOf4 (regardless of properties). -constexpr bool is2OutOf4LT(LevelType lt) { - return (static_cast(lt) & ~3) == - static_cast(LevelType::TwoOutOfFour); -} - /// Check if the `LevelType` needs positions array. 
constexpr bool isWithPosLT(LevelType lt) { return isCompressedLT(lt) || isLooseCompressedLT(lt); @@ -287,17 +317,19 @@ constexpr bool isWithPosLT(LevelType lt) { /// Check if the `LevelType` needs coordinates array. constexpr bool isWithCrdLT(LevelType lt) { return isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) || - is2OutOf4LT(lt); + isNOutOfMLT(lt); } /// Check if the `LevelType` is ordered (regardless of storage format). constexpr bool isOrderedLT(LevelType lt) { return !(static_cast(lt) & 2); + return !(static_cast(lt) & 2); } /// Check if the `LevelType` is unique (regardless of storage format). constexpr bool isUniqueLT(LevelType lt) { return !(static_cast(lt) & 1); + return !(static_cast(lt) & 1); } /// Convert a LevelType to its corresponding LevelFormat. @@ -305,21 +337,25 @@ constexpr bool isUniqueLT(LevelType lt) { constexpr std::optional getLevelFormat(LevelType lt) { if (lt == LevelType::Undef) return std::nullopt; - return static_cast(static_cast(lt) & ~3); + return static_cast(static_cast(lt) & 0xffff0000); } /// Convert a LevelFormat to its corresponding LevelType with the given /// properties. Returns std::nullopt when the properties are not applicable /// for the input level format. constexpr std::optional buildLevelType(LevelFormat lf, bool ordered, - bool unique) { - auto lt = static_cast(static_cast(lf) | - (ordered ? 0 : 2) | (unique ? 0 : 1)); + bool unique, uint64_t n = 0, + uint64_t m = 0) { + uint64_t newN = n << 32; + uint64_t newM = m << 40; + auto lt = + static_cast(static_cast(lf) | (ordered ? 0 : 2) | + (unique ? 0 : 1) | newN | newM); return isValidLT(lt) ? std::optional(lt) : std::nullopt; } // -// Ensure the above methods work as indended. +// Ensure the above methods work as intended. // static_assert( @@ -341,7 +377,7 @@ static_assert( LevelFormat::LooseCompressed && *getLevelFormat(LevelType::LooseCompressedNuNo) == LevelFormat::LooseCompressed && - *getLevelFormat(LevelType::TwoOutOfFour) == LevelFormat::TwoOutOfFour), + *getLevelFormat(LevelType::NOutOfM) == LevelFormat::NOutOfM), "getLevelFormat conversion is broken"); static_assert( @@ -373,14 +409,29 @@ static_assert( LevelType::LooseCompressedNo && *buildLevelType(LevelFormat::LooseCompressed, false, false) == LevelType::LooseCompressedNuNo && - buildLevelType(LevelFormat::TwoOutOfFour, false, true) == std::nullopt && - buildLevelType(LevelFormat::TwoOutOfFour, true, false) == std::nullopt && - buildLevelType(LevelFormat::TwoOutOfFour, false, false) == std::nullopt && - *buildLevelType(LevelFormat::TwoOutOfFour, true, true) == - LevelType::TwoOutOfFour), + buildLevelType(LevelFormat::NOutOfM, false, true) == std::nullopt && + buildLevelType(LevelFormat::NOutOfM, true, false) == std::nullopt && + buildLevelType(LevelFormat::NOutOfM, false, false) == std::nullopt && + *buildLevelType(LevelFormat::NOutOfM, true, true) == LevelType::NOutOfM), "buildLevelType conversion is broken"); static_assert( + (getN(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4)) == 2 && + getM(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4)) == 4 && + getN(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10)) == 8 && + getM(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10)) == 10), + "getN/M conversion is broken"); + +static_assert( + (isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4), + 2, 4) && + isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10), + 8, 10) && + !isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 3, 4), 
+ 2, 4)), + "isValidNOutOfMLT definition is broken"); + +static_assert( (isValidLT(LevelType::Undef) && isValidLT(LevelType::Dense) && isValidLT(LevelType::Compressed) && isValidLT(LevelType::CompressedNu) && isValidLT(LevelType::CompressedNo) && @@ -391,7 +442,7 @@ static_assert( isValidLT(LevelType::LooseCompressedNu) && isValidLT(LevelType::LooseCompressedNo) && isValidLT(LevelType::LooseCompressedNuNo) && - isValidLT(LevelType::TwoOutOfFour)), + isValidLT(LevelType::NOutOfM)), "isValidLT definition is broken"); static_assert((isDenseLT(LevelType::Dense) && @@ -407,7 +458,7 @@ static_assert((isDenseLT(LevelType::Dense) && !isDenseLT(LevelType::LooseCompressedNu) && !isDenseLT(LevelType::LooseCompressedNo) && !isDenseLT(LevelType::LooseCompressedNuNo) && - !isDenseLT(LevelType::TwoOutOfFour)), + !isDenseLT(LevelType::NOutOfM)), "isDenseLT definition is broken"); static_assert((!isCompressedLT(LevelType::Dense) && @@ -423,7 +474,7 @@ static_assert((!isCompressedLT(LevelType::Dense) && !isCompressedLT(LevelType::LooseCompressedNu) && !isCompressedLT(LevelType::LooseCompressedNo) && !isCompressedLT(LevelType::LooseCompressedNuNo) && - !isCompressedLT(LevelType::TwoOutOfFour)), + !isCompressedLT(LevelType::NOutOfM)), "isCompressedLT definition is broken"); static_assert((!isSingletonLT(LevelType::Dense) && @@ -439,7 +490,7 @@ static_assert((!isSingletonLT(LevelType::Dense) && !isSingletonLT(LevelType::LooseCompressedNu) && !isSingletonLT(LevelType::LooseCompressedNo) && !isSingletonLT(LevelType::LooseCompressedNuNo) && - !isSingletonLT(LevelType::TwoOutOfFour)), + !isSingletonLT(LevelType::NOutOfM)), "isSingletonLT definition is broken"); static_assert((!isLooseCompressedLT(LevelType::Dense) && @@ -455,24 +506,24 @@ static_assert((!isLooseCompressedLT(LevelType::Dense) && isLooseCompressedLT(LevelType::LooseCompressedNu) && isLooseCompressedLT(LevelType::LooseCompressedNo) && isLooseCompressedLT(LevelType::LooseCompressedNuNo) && - !isLooseCompressedLT(LevelType::TwoOutOfFour)), + !isLooseCompressedLT(LevelType::NOutOfM)), "isLooseCompressedLT definition is broken"); -static_assert((!is2OutOf4LT(LevelType::Dense) && - !is2OutOf4LT(LevelType::Compressed) && - !is2OutOf4LT(LevelType::CompressedNu) && - !is2OutOf4LT(LevelType::CompressedNo) && - !is2OutOf4LT(LevelType::CompressedNuNo) && - !is2OutOf4LT(LevelType::Singleton) && - !is2OutOf4LT(LevelType::SingletonNu) && - !is2OutOf4LT(LevelType::SingletonNo) && - !is2OutOf4LT(LevelType::SingletonNuNo) && - !is2OutOf4LT(LevelType::LooseCompressed) && - !is2OutOf4LT(LevelType::LooseCompressedNu) && - !is2OutOf4LT(LevelType::LooseCompressedNo) && - !is2OutOf4LT(LevelType::LooseCompressedNuNo) && - is2OutOf4LT(LevelType::TwoOutOfFour)), - "is2OutOf4LT definition is broken"); +static_assert((!isNOutOfMLT(LevelType::Dense) && + !isNOutOfMLT(LevelType::Compressed) && + !isNOutOfMLT(LevelType::CompressedNu) && + !isNOutOfMLT(LevelType::CompressedNo) && + !isNOutOfMLT(LevelType::CompressedNuNo) && + !isNOutOfMLT(LevelType::Singleton) && + !isNOutOfMLT(LevelType::SingletonNu) && + !isNOutOfMLT(LevelType::SingletonNo) && + !isNOutOfMLT(LevelType::SingletonNuNo) && + !isNOutOfMLT(LevelType::LooseCompressed) && + !isNOutOfMLT(LevelType::LooseCompressedNu) && + !isNOutOfMLT(LevelType::LooseCompressedNo) && + !isNOutOfMLT(LevelType::LooseCompressedNuNo) && + isNOutOfMLT(LevelType::NOutOfM)), + "isNOutOfMLT definition is broken"); static_assert((isOrderedLT(LevelType::Dense) && isOrderedLT(LevelType::Compressed) && @@ -487,7 +538,7 @@ 
static_assert((isOrderedLT(LevelType::Dense) && isOrderedLT(LevelType::LooseCompressedNu) && !isOrderedLT(LevelType::LooseCompressedNo) && !isOrderedLT(LevelType::LooseCompressedNuNo) && - isOrderedLT(LevelType::TwoOutOfFour)), + isOrderedLT(LevelType::NOutOfM)), "isOrderedLT definition is broken"); static_assert((isUniqueLT(LevelType::Dense) && @@ -503,7 +554,7 @@ static_assert((isUniqueLT(LevelType::Dense) && !isUniqueLT(LevelType::LooseCompressedNu) && isUniqueLT(LevelType::LooseCompressedNo) && !isUniqueLT(LevelType::LooseCompressedNuNo) && - isUniqueLT(LevelType::TwoOutOfFour)), + isUniqueLT(LevelType::NOutOfM)), "isUniqueLT definition is broken"); /// Bit manipulations for affine encoding. diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index 12c1068..5b3b971 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -145,7 +145,8 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", - **compressed** : only nonzeros along this level are stored - **loose_compressed** : as compressed, but allows for free space between regions - **singleton** : a variant of the compressed format, where coordinates have no siblings - - **block2_4** : the compression uses a 2:4 encoding per 1x4 block + - **structured[n, m]** : the compression uses a n:m encoding + (viz. n out of m consecutive elements are nonzero) For a compressed level, each position interval is represented in a compact way with a lowerbound `pos(i)` and an upperbound `pos(i+1) - 1`, which implies @@ -374,7 +375,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", bool isCompressedLvl(::mlir::sparse_tensor::Level l) const { return isCompressedLT(getLvlType(l)); } bool isSingletonLvl(::mlir::sparse_tensor::Level l) const { return isSingletonLT(getLvlType(l)); } bool isLooseCompressedLvl(::mlir::sparse_tensor::Level l) const { return isLooseCompressedLT(getLvlType(l)); } - bool isTwoOutOfFourLvl(::mlir::sparse_tensor::Level l) const { return is2OutOf4LT(getLvlType(l)); } + bool isNOutOfMLvl(::mlir::sparse_tensor::Level l) const { return isNOutOfMLT(getLvlType(l)); } bool isOrderedLvl(::mlir::sparse_tensor::Level l) const { return isOrderedLT(getLvlType(l)); } bool isUniqueLvl(::mlir::sparse_tensor::Level l) const { return isUniqueLT(getLvlType(l)); } diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h index 4c98129..4e2b85d 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h @@ -291,7 +291,7 @@ public: return isLooseCompressedLT(getLvlType(l)); } bool isSingletonLvl(Level l) const { return isSingletonLT(getLvlType(l)); } - bool is2OutOf4Lvl(Level l) const { return is2OutOf4LT(getLvlType(l)); } + bool isNOutOfMLvl(Level l) const { return isNOutOfMLT(getLvlType(l)); } bool isOrderedLvl(Level l) const { return isOrderedLT(getLvlType(l)); } bool isUniqueLvl(Level l) const { return isUniqueLT(getLvlType(l)); } bool isWithPos(Level l) const { return isWithPosLT(getLvlType(l)); } diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h index 4a34bb2..490ef30 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ 
b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -510,7 +510,7 @@ public: if (isLvlWithNonTrivialIdxExp(b)) { auto lt = getLoopDependentLevelType(b); return isCompressedLT(lt) || isSingletonLT(lt) || - isLooseCompressedLT(lt) || is2OutOf4LT(lt); + isLooseCompressedLT(lt) || isNOutOfMLT(lt); } return false; } diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h index 01c5f23..1418217 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h @@ -123,8 +123,8 @@ public: /// Safely checks if the level uses singleton storage. bool isSingletonLvl(uint64_t l) const { return isSingletonLT(getLvlType(l)); } - /// Safely checks if the level uses 2 out of 4 storage. - bool is2OutOf4Lvl(uint64_t l) const { return is2OutOf4LT(getLvlType(l)); } + /// Safely checks if the level uses n out of m storage. + bool isNOutOfMLvl(uint64_t l) const { return isNOutOfMLT(getLvlType(l)); } /// Safely checks if the level is ordered. bool isOrderedLvl(uint64_t l) const { return isOrderedLT(getLvlType(l)); } @@ -450,7 +450,7 @@ private: void appendCrd(uint64_t lvl, uint64_t full, uint64_t crd) { if (!isDenseLvl(lvl)) { assert(isCompressedLvl(lvl) || isLooseCompressedLvl(lvl) || - isSingletonLvl(lvl) || is2OutOf4Lvl(lvl)); + isSingletonLvl(lvl) || isNOutOfMLvl(lvl)); coordinates[lvl].push_back(detail::checkOverflowCast(crd)); } else { // Dense level. assert(crd >= full && "Coordinate was already filled"); @@ -473,7 +473,7 @@ private: return positions[l][parentSz]; if (isLooseCompressedLvl(l)) return positions[l][2 * parentSz - 1]; - if (isSingletonLvl(l) || is2OutOf4Lvl(l)) + if (isSingletonLvl(l) || isNOutOfMLvl(l)) return parentSz; // new size same as the parent assert(isDenseLvl(l)); return parentSz * getLvlSize(l); @@ -527,7 +527,7 @@ private: uint64_t pos = coordinates[l].size(); positions[l].insert(positions[l].end(), 2 * count, detail::checkOverflowCast

(pos)); - } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) { + } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) { return; // Nothing to finalize. } else { // Dense dimension. assert(isDenseLvl(l)); @@ -624,7 +624,7 @@ private: lvlCursor[l] = static_cast(coordinatesL[pos]); toCOO(pos, l + 1, dimCoords); } - } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) { + } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) { assert(parentPos < coordinates[l].size()); lvlCursor[l] = static_cast(coordinates[l][parentPos]); toCOO(parentPos, l + 1, dimCoords); @@ -721,8 +721,8 @@ SparseTensorStorage::SparseTensorStorage( } else if (isSingletonLvl(l)) { coordinates[l].reserve(sz); sz = 1; - } else if (is2OutOf4Lvl(l)) { - assert(l == lvlRank - 1 && "unexpected 2:4 usage"); + } else if (isNOutOfMLvl(l)) { + assert(l == lvlRank - 1 && "unexpected n:m usage"); sz = detail::checkedMul(sz, lvlSizes[l]) / 2; coordinates[l].reserve(sz); values.reserve(sz); @@ -791,8 +791,8 @@ SparseTensorStorage::SparseTensorStorage( } } else if (isSingletonLvl(l)) { assert(0 && "general singleton not supported yet"); - } else if (is2OutOf4Lvl(l)) { - assert(0 && "2Out4 not supported yet"); + } else if (isNOutOfMLvl(l)) { + assert(0 && "n ouf of m not supported yet"); } else { assert(isDenseLvl(l)); } diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp index 698367a..607534c 100644 --- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp +++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp @@ -25,7 +25,7 @@ using namespace mlir::python::adaptors; static void populateDialectSparseTensorSubmodule(const py::module &m) { py::enum_(m, "LevelType", py::module_local()) .value("dense", MLIR_SPARSE_TENSOR_LEVEL_DENSE) - .value("compressed24", MLIR_SPARSE_TENSOR_LEVEL_TWO_OUT_OF_FOUR) + .value("n_out_of_m", MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M) .value("compressed", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) .value("compressed_nu", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) .value("compressed_no", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp index e4534ad..a34b9a29 100644 --- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp +++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp @@ -20,25 +20,36 @@ MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(SparseTensor, sparse_tensor, mlir::sparse_tensor::SparseTensorDialect) // Ensure the C-API enums are int-castable to C++ equivalents. 
-static_assert(static_cast(MLIR_SPARSE_TENSOR_LEVEL_DENSE) == - static_cast(LevelType::Dense) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) == - static_cast(LevelType::Compressed) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) == - static_cast(LevelType::CompressedNu) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) == - static_cast(LevelType::CompressedNo) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO) == - static_cast(LevelType::CompressedNuNo) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON) == - static_cast(LevelType::Singleton) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU) == - static_cast(LevelType::SingletonNu) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO) == - static_cast(LevelType::SingletonNo) && - static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO) == - static_cast(LevelType::SingletonNuNo), - "MlirSparseTensorLevelType (C-API) and LevelType (C++) mismatch"); +static_assert( + static_cast(MLIR_SPARSE_TENSOR_LEVEL_DENSE) == + static_cast(LevelType::Dense) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) == + static_cast(LevelType::Compressed) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) == + static_cast(LevelType::CompressedNu) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) == + static_cast(LevelType::CompressedNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO) == + static_cast(LevelType::CompressedNuNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON) == + static_cast(LevelType::Singleton) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU) == + static_cast(LevelType::SingletonNu) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO) == + static_cast(LevelType::SingletonNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO) == + static_cast(LevelType::SingletonNuNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED) == + static_cast(LevelType::LooseCompressed) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU) == + static_cast(LevelType::LooseCompressedNu) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO) == + static_cast(LevelType::LooseCompressedNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO) == + static_cast(LevelType::LooseCompressedNuNo) && + static_cast(MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M) == + static_cast(LevelType::NOutOfM), + "MlirSparseTensorLevelType (C-API) and LevelType (C++) mismatch"); bool mlirAttributeIsASparseTensorEncodingAttr(MlirAttribute attr) { return isa(unwrap(attr)); diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp index eb7ea63..752d6e6 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp @@ -29,12 +29,21 @@ using namespace mlir::sparse_tensor::ir_detail; // `LvlTypeParser` implementation. //===----------------------------------------------------------------------===// -FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { +FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { StringRef base; const auto loc = parser.getCurrentLocation(); ERROR_IF(failed(parser.parseOptionalKeyword(&base)), "expected valid level format (e.g. 
dense, compressed or singleton)") - uint8_t properties = 0; + uint64_t properties = 0; + SmallVector structure; + + if (base.compare("structured") == 0) { + ParseResult res = parser.parseCommaSeparatedList( + mlir::OpAsmParser::Delimiter::OptionalSquare, + [&]() -> ParseResult { return parseStructure(parser, &structure); }, + " in block n out of m"); + FAILURE_IF_FAILED(res) + } ParseResult res = parser.parseCommaSeparatedList( mlir::OpAsmParser::Delimiter::OptionalParen, @@ -44,15 +53,20 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { // Set the base bit for properties. if (base.compare("dense") == 0) { - properties |= static_cast(LevelFormat::Dense); + properties |= static_cast(LevelFormat::Dense); } else if (base.compare("compressed") == 0) { - properties |= static_cast(LevelFormat::Compressed); - } else if (base.compare("block2_4") == 0) { - properties |= static_cast(LevelFormat::TwoOutOfFour); + properties |= static_cast(LevelFormat::Compressed); + } else if (base.compare("structured") == 0) { + if (structure.size() != 2) { + parser.emitError(loc, "expected exactly 2 structure sizes"); + return failure(); + } + properties |= static_cast(LevelFormat::NOutOfM); + properties |= nToBits(structure[0]) | mToBits(structure[1]); } else if (base.compare("loose_compressed") == 0) { - properties |= static_cast(LevelFormat::LooseCompressed); + properties |= static_cast(LevelFormat::LooseCompressed); } else if (base.compare("singleton") == 0) { - properties |= static_cast(LevelFormat::Singleton); + properties |= static_cast(LevelFormat::Singleton); } else { parser.emitError(loc, "unknown level format: ") << base; return failure(); @@ -64,15 +78,15 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { } ParseResult LvlTypeParser::parseProperty(AsmParser &parser, - uint8_t *properties) const { + uint64_t *properties) const { StringRef strVal; auto loc = parser.getCurrentLocation(); ERROR_IF(failed(parser.parseOptionalKeyword(&strVal)), "expected valid level property (e.g. 
nonordered, nonunique or high)") if (strVal.compare("nonunique") == 0) { - *properties |= static_cast(LevelPropertyNondefault::Nonunique); + *properties |= static_cast(LevelPropertyNondefault::Nonunique); } else if (strVal.compare("nonordered") == 0) { - *properties |= static_cast(LevelPropertyNondefault::Nonordered); + *properties |= static_cast(LevelPropertyNondefault::Nonordered); } else { parser.emitError(loc, "unknown level property: ") << strVal; return failure(); @@ -80,4 +94,22 @@ ParseResult LvlTypeParser::parseProperty(AsmParser &parser, return success(); } +ParseResult +LvlTypeParser::parseStructure(AsmParser &parser, + SmallVector *structure) const { + int intVal; + auto loc = parser.getCurrentLocation(); + OptionalParseResult intValParseResult = parser.parseOptionalInteger(intVal); + if (intValParseResult.has_value()) { + if (failed(*intValParseResult)) { + parser.emitError(loc, "failed to parse block size"); + return failure(); + } + structure->push_back(intVal); + return success(); + } + parser.emitError(loc, "expected valid integer for block size"); + return failure(); +} + //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h index 5e2f11b..6a13112 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h @@ -18,10 +18,12 @@ namespace ir_detail { class LvlTypeParser { public: LvlTypeParser() = default; - FailureOr parseLvlType(AsmParser &parser) const; + FailureOr parseLvlType(AsmParser &parser) const; private: - ParseResult parseProperty(AsmParser &parser, uint8_t *properties) const; + ParseResult parseProperty(AsmParser &parser, uint64_t *properties) const; + ParseResult parseStructure(AsmParser &parser, + SmallVector *structure) const; }; } // namespace ir_detail diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 27125bc..67b1d79 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -613,16 +613,28 @@ void SparseTensorEncodingAttr::printDimensions( } } +std::string getNOutOfMString(LevelType lt) { + if (isNOutOfMLT(lt)) { + unsigned n = getN(lt); + unsigned m = getM(lt); + auto output = "[" + std::to_string(n) + ", " + std::to_string(m) + "]"; + return output; + } + return ""; +} + void SparseTensorEncodingAttr::printLevels(AffineMap &map, AsmPrinter &printer, ArrayRef lvlTypes) const { for (unsigned i = 0, n = map.getNumResults() - 1; i < n; i++) { map.getResult(i).print(printer.getStream()); - printer << " : " << toMLIRString(lvlTypes[i]) << ", "; + printer << " : " << toMLIRString(lvlTypes[i]) + << getNOutOfMString(lvlTypes[i]) << ", "; } if (map.getNumResults() >= 1) { auto lastIndex = map.getNumResults() - 1; map.getResult(lastIndex).print(printer.getStream()); - printer << " : " << toMLIRString(lvlTypes[lastIndex]); + printer << " : " << toMLIRString(lvlTypes[lastIndex]) + << getNOutOfMString(lvlTypes[lastIndex]); } } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index dd3af9d..3f352c8 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -451,7 +451,7 @@ static bool 
isAdmissibleBSR(SparseTensorType &aTp) { /// Test for 2:4 matrix with suitable metadata. static bool isAdmissible24(SparseTensorType &aTp) { return aTp.getDimRank() == 2 && aTp.getLvlRank() == 3 && aTp.isDenseLvl(0) && - aTp.isDenseLvl(1) && aTp.is2OutOf4Lvl(2) && isAdmissibleMetaData(aTp); + aTp.isDenseLvl(1) && aTp.isNOutOfMLvl(2) && isAdmissibleMetaData(aTp); } /// Test for conversion into 2:4 matrix. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp index 491501a..d4459c6 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp @@ -130,7 +130,7 @@ static void allocSchemeForRank(OpBuilder &builder, Location loc, createPushback(builder, loc, desc, SparseTensorFieldKind::PosMemRef, lvl, /*value=*/posZero, /*repeat=*/linear); return; - } else if (isSingletonLT(lt) || is2OutOf4LT(lt)) { + } else if (isSingletonLT(lt) || isNOutOfMLT(lt)) { return; // nothing to do } // Keep compounding the size, but nothing needs to be initialized @@ -409,7 +409,7 @@ static void genEndInsert(OpBuilder &builder, Location loc, } } else { assert(isDenseLT(lt) || isLooseCompressedLT(lt) || isSingletonLT(lt) || - is2OutOf4LT(lt)); + isNOutOfMLT(lt)); } } } @@ -488,7 +488,7 @@ public: } parentPos = genCompressed(builder, loc, desc, coords, value, parentPos, lvl); - } else if (isSingletonLT(lt) || is2OutOf4LT(lt)) { + } else if (isSingletonLT(lt) || isNOutOfMLT(lt)) { // Create: // coordinates[lvl].push_back(coords[lvl]) // positions[lvl] = positions[lvl-1] diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index ab38ab5..8f2ae60 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -891,7 +891,7 @@ static scf::IfOp genIf(CodegenEnv &env, OpBuilder &builder, LoopId curr, assert(curr == env.merger().loop(b)); Value clause; if (isCompressedLT(lt) || isSingletonLT(lt) || - isLooseCompressedLT(lt) || is2OutOf4LT(lt)) { + isLooseCompressedLT(lt) || isNOutOfMLT(lt)) { assert(lvl.has_value()); const Value crd = env.emitter().getCoord(tid, *lvl); const Value lvar = env.getLoopVar(curr); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp index 4ba9ecb..c85f820 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp @@ -139,18 +139,19 @@ public: } }; -class TwoOutFourLevel : public SparseLevel { +class NOutOfMLevel : public SparseLevel { public: - TwoOutFourLevel(unsigned tid, Level lvl, LevelType lt, Value lvlSize, - Value crdBuffer) + NOutOfMLevel(unsigned tid, Level lvl, LevelType lt, Value lvlSize, + Value crdBuffer) : SparseLevel(tid, lvl, lt, lvlSize, crdBuffer) {} ValuePair peekRangeAt(OpBuilder &b, Location l, Value p, Value max) const override { - assert(max == nullptr && isUnique() && "2:4 level can not be non-unique."); - // Each 2:4 blk has exactly two specified elements. - Value posLo = MULI(p, C_IDX(2)); - return {posLo, ADDI(posLo, C_IDX(2))}; + assert(max == nullptr && isUnique() && "n:m level can not be non-unique."); + // Each n:m blk has exactly n specified elements. 
+ auto n = getN(lt); + Value posLo = MULI(p, C_IDX(n)); + return {posLo, ADDI(posLo, C_IDX(n))}; } }; @@ -1291,9 +1292,9 @@ sparse_tensor::makeSparseTensorLevel(OpBuilder &b, Location l, Value t, Value crd = genToCoordinates(b, l, t, lvl); return std::make_unique(tid, lvl, lt, sz, crd); } - case LevelFormat::TwoOutOfFour: { + case LevelFormat::NOutOfM: { Value crd = genToCoordinates(b, l, t, lvl); - return std::make_unique(tid, lvl, lt, sz, crd); + return std::make_unique(tid, lvl, lt, sz, crd); } } llvm_unreachable("unrecognizable level format"); diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp index 6cdf5f8..96537cb 100644 --- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp +++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp @@ -489,7 +489,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) { if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) { const auto lt = getLvlType(b); if (!isCompressedLT(lt) && !isSingletonLT(lt) && - !isLooseCompressedLT(lt) && !is2OutOf4LT(lt)) { + !isLooseCompressedLT(lt) && !isNOutOfMLT(lt)) { if (reset) simple.reset(b); reset = true; @@ -670,7 +670,7 @@ bool Merger::hasAnySparse(const BitVector &bits) const { for (TensorLoopId b : bits.set_bits()) { const auto lt = getLvlType(b); if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) || - is2OutOf4LT(lt)) + isNOutOfMLT(lt)) return true; } return hasSparseIdxReduction(bits); diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp index 0c7b3a2..9e8b240 100644 --- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp @@ -45,7 +45,7 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT for (uint64_t l = 0; l < lvlRank; l++) { assert(lvlSizes[l] > 0 && "Level size zero has trivial storage"); assert(isDenseLvl(l) || isCompressedLvl(l) || isLooseCompressedLvl(l) || - isSingletonLvl(l) || is2OutOf4Lvl(l)); + isSingletonLvl(l) || isNOutOfMLvl(l)); } } diff --git a/mlir/test/CAPI/sparse_tensor.c b/mlir/test/CAPI/sparse_tensor.c index 2c6ad55..a8b9f90 100644 --- a/mlir/test/CAPI/sparse_tensor.c +++ b/mlir/test/CAPI/sparse_tensor.c @@ -38,9 +38,9 @@ static int testRoundtripEncoding(MlirContext ctx) { mlirSparseTensorEncodingAttrGetDimToLvl(originalAttr); // CHECK: (d0, d1)[s0] -> (s0, d0, d1) mlirAffineMapDump(dimToLvl); - // CHECK: level_type: 4 - // CHECK: level_type: 8 - // CHECK: level_type: 8 + // CHECK: level_type: 65536 + // CHECK: level_type: 131072 + // CHECK: level_type: 131072 MlirAffineMap lvlToDim = mlirSparseTensorEncodingAttrGetLvlToDim(originalAttr); int lvlRank = mlirSparseTensorEncodingGetLvlRank(originalAttr); diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir index 6fe7ec9..8293169 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir @@ -4,7 +4,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ) }> diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir index 20702bb..6452063 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir @@ -207,12 +207,12 @@ func.func private @BSR_explicit(%arg0: tensor) { map = ( i, j ) -> ( i : dense, j 
floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ), crdWidth = 8 // we would even like just 2-bits }> -// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : block2_4), crdWidth = 8 }> +// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]), crdWidth = 8 }> // CHECK-LABEL: func private @NV_24( // CHECK-SAME: tensor func.func private @NV_24(%arg0: tensor) { @@ -226,11 +226,11 @@ func.func private @NV_24(%arg0: tensor) { ( i : dense, j : dense, k floordiv 4 : dense, - k mod 4 : block2_4 + k mod 4 : structured[2, 4] ) }> -// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 floordiv 4 : dense, d2 mod 4 : block2_4) }> +// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 floordiv 4 : dense, d2 mod 4 : structured[2, 4]) }> // CHECK-LABEL: func private @NV_24( // CHECK-SAME: tensor func.func private @NV_24(%arg0: tensor) { @@ -244,13 +244,31 @@ func.func private @NV_24(%arg0: tensor) { ( i : dense, k floordiv 4 : dense, j : dense, - k mod 4 : block2_4 + k mod 4 : structured[2, 4] ) }> -// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 4 : dense, d1 : dense, d2 mod 4 : block2_4) }> +// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 4 : dense, d1 : dense, d2 mod 4 : structured[2, 4]) }> // CHECK-LABEL: func private @NV_24( // CHECK-SAME: tensor func.func private @NV_24(%arg0: tensor) { return } + +// ----- + +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 8 : dense, + j : dense, + k mod 8 : structured[5, 8] + ) +}> + +// CHECK-DAG: #[[$NOutOfM:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 8 : dense, d1 : dense, d2 mod 8 : structured[5, 8]) }> +// CHECK-LABEL: func private @NOutOfM( +// CHECK-SAME: tensor +func.func private @NOutOfM(%arg0: tensor) { + return +} diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir index 7c494b2..d04fbe8 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir @@ -14,7 +14,7 @@ // CHECK-DAG: %[[VAL_8:.*]] = arith.constant true // CHECK-DAG: %[[VAL_9:.*]] = arith.constant 100 : index // CHECK-DAG: %[[VAL_10:.*]] = arith.constant 300 : index -// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 8 : i64 +// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 131072 : i64 // CHECK: %[[VAL_12:.*]] = memref.alloca() : memref<2xi64> // CHECK: %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<2xi64> to memref // CHECK: memref.store %[[VAL_11]], %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<2xi64> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir index 4bc080f..e47ac46 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir @@ -59,7 +59,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ), }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir index 
df5b48a..ec5c758 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir @@ -41,7 +41,7 @@ #NV_24 = #sparse_tensor.encoding<{ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4), + j mod 4 : structured[2, 4]), crdWidth = 8 }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir index 17b50b4..b0f63f1 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir @@ -20,7 +20,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ) }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir index eb99a02..311cb60 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir @@ -20,7 +20,7 @@ map = ( i, j ) -> ( i : dense, j floordiv 4 : dense, - j mod 4 : block2_4 + j mod 4 : structured[2, 4] ) }> diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py index 946a224..412c579 100644 --- a/mlir/test/python/dialects/sparse_tensor/dialect.py +++ b/mlir/test/python/dialects/sparse_tensor/dialect.py @@ -28,7 +28,7 @@ def testEncodingAttr1D(): # CHECK: equal: True print(f"equal: {casted == parsed}") - # CHECK: lvl_types: [8] + # CHECK: lvl_types: [131072] print(f"lvl_types: {casted.lvl_types}") # CHECK: dim_to_lvl: (d0) -> (d0) print(f"dim_to_lvl: {casted.dim_to_lvl}") @@ -70,7 +70,7 @@ def testEncodingAttr2D(): # CHECK: equal: True print(f"equal: {casted == parsed}") - # CHECK: lvl_types: [4, 8] + # CHECK: lvl_types: [65536, 131072] print(f"lvl_types: {casted.lvl_types}") # CHECK: dim_to_lvl: (d0, d1) -> (d1, d0) print(f"dim_to_lvl: {casted.dim_to_lvl}") -- cgit v1.1 From d1fdb416299c0efa5979ed989f7c1f39973dcb73 Mon Sep 17 00:00:00 2001 From: John Demme Date: Thu, 8 Feb 2024 11:39:06 -0800 Subject: [MLIR][Python] Add method for getting the live operation objects (#78663) Currently, a method exists to get the count of the operation objects which are still alive. This helps for sanity checking, but isn't terribly useful for debugging. This new method returns the actual operation objects which are still alive. This allows Python code like the following: ``` gc.collect() live_ops = ir.Context.current._get_live_operation_objects() for op in live_ops: print(f"Warning: {op} is still live. 
Referrers:") for referrer in gc.get_referrers(op)[0]: print(f" {referrer}") ``` --- mlir/lib/Bindings/Python/IRCore.cpp | 9 +++++++++ mlir/lib/Bindings/Python/IRModule.h | 3 +++ mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 1 + mlir/test/python/ir/module.py | 4 ++++ 4 files changed, 17 insertions(+) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 5412c3d..8a7951d 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -636,6 +636,13 @@ size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } +std::vector PyMlirContext::getLiveOperationObjects() { + std::vector liveObjects; + for (auto &entry : liveOperations) + liveObjects.push_back(entry.second.second); + return liveObjects; +} + size_t PyMlirContext::clearLiveOperations() { for (auto &op : liveOperations) op.second.second->setInvalid(); @@ -2546,6 +2553,8 @@ void mlir::python::populateIRCore(py::module &m) { return ref.releaseObject(); }) .def("_get_live_operation_count", &PyMlirContext::getLiveOperationCount) + .def("_get_live_operation_objects", + &PyMlirContext::getLiveOperationObjects) .def("_clear_live_operations", &PyMlirContext::clearLiveOperations) .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 79b7e0c..48f39c9 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -201,6 +201,9 @@ public: /// Gets the count of live context objects. Used for testing. static size_t getLiveCount(); + /// Get a list of Python objects which are still in the live context map. + std::vector getLiveOperationObjects(); + /// Gets the count of live operations associated with this context. /// Used for testing. size_t getLiveOperationCount(); diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 57a8599..344abb6 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -985,6 +985,7 @@ class Context: def _get_context_again(self) -> Context: ... def _get_live_module_count(self) -> int: ... def _get_live_operation_count(self) -> int: ... + def _get_live_operation_objects(self) -> List[Operation]: ... def append_dialect_registry(self, registry: DialectRegistry) -> None: ... 
def attach_diagnostic_handler( self, callback: Callable[[Diagnostic], bool] diff --git a/mlir/test/python/ir/module.py b/mlir/test/python/ir/module.py index a5c38a6..ecafcb4 100644 --- a/mlir/test/python/ir/module.py +++ b/mlir/test/python/ir/module.py @@ -105,6 +105,10 @@ def testModuleOperation(): assert ctx._get_live_module_count() == 1 op1 = module.operation assert ctx._get_live_operation_count() == 1 + live_ops = ctx._get_live_operation_objects() + assert len(live_ops) == 1 + assert live_ops[0] is op1 + live_ops = None # CHECK: module @successfulParse print(op1) -- cgit v1.1 From 705fcd4e0addee6e9e13541dbcbc81cec9748a83 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 8 Feb 2024 11:50:53 -0800 Subject: Revert "[lldb] Expand background symbol lookup" (#81182) Reverts llvm/llvm-project#80890 --- lldb/include/lldb/Core/ModuleList.h | 23 +---------------------- lldb/include/lldb/lldb-enumerations.h | 6 ------ lldb/source/Core/CoreProperties.td | 7 +------ lldb/source/Core/ModuleList.cpp | 13 ++++--------- lldb/source/Host/common/Host.cpp | 2 -- lldb/source/Symbol/SymbolLocator.cpp | 22 ++++++---------------- 6 files changed, 12 insertions(+), 61 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index 43d931a..d78f7c5 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -47,26 +47,6 @@ class UUID; class VariableList; struct ModuleFunctionSearchOptions; -static constexpr OptionEnumValueElement g_auto_download_enum_values[] = { - { - lldb::eSymbolDownloadOff, - "off", - "Disable automatically downloading symbols.", - }, - { - lldb::eSymbolDownloadBackground, - "background", - "Download symbols in the background for images as they appear in the " - "backtrace.", - }, - { - lldb::eSymbolDownloadForeground, - "foreground", - "Download symbols in the foreground for images as they appear in the " - "backtrace.", - }, -}; - class ModuleListProperties : public Properties { mutable llvm::sys::RWMutex m_symlink_paths_mutex; PathMappingList m_symlink_paths; @@ -80,6 +60,7 @@ public: bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); + bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -90,8 +71,6 @@ public: bool GetLoadSymbolOnDemand(); - lldb::SymbolDownload GetSymbolAutoDownload() const; - PathMappingList GetSymlinkMappings() const; }; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 4640533..7e9b538 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1314,12 +1314,6 @@ enum class ChildCacheState { ///< re-use what we computed the last time we called Update. 
}; -enum SymbolDownload { - eSymbolDownloadOff = 0, - eSymbolDownloadBackground = 1, - eSymbolDownloadForeground = 2, -}; - } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 9c4aa2d..8d81967 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -8,12 +8,7 @@ let Definition = "modulelist" in { def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, Global, DefaultFalse, - Desc<"Alias for backward compatibility: when enabled this is the equivalent to 'symbols.download background'.">; - def AutoDownload: Property<"auto-download", "Enum">, - Global, - DefaultEnumValue<"eSymbolDownloadOff">, - EnumValues<"OptionEnumValues(g_auto_download_enum_values)">, - Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">; + Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index b03490b..b7f3936 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -104,15 +104,10 @@ bool ModuleListProperties::SetEnableExternalLookup(bool new_value) { return SetPropertyAtIndex(ePropertyEnableExternalLookup, new_value); } -SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const { - // Backward compatibility alias. - if (GetPropertyAtIndexAs(ePropertyEnableBackgroundLookup, false)) - return eSymbolDownloadBackground; - - const uint32_t idx = ePropertyAutoDownload; - return GetPropertyAtIndexAs( - idx, static_cast( - g_modulelist_properties[idx].default_uint_value)); +bool ModuleListProperties::GetEnableBackgroundLookup() const { + const uint32_t idx = ePropertyEnableBackgroundLookup; + return GetPropertyAtIndexAs( + idx, g_modulelist_properties[idx].default_uint_value != 0); } FileSpec ModuleListProperties::GetClangModulesCachePath() const { diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index b72ba7e..f4cec97 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -550,8 +550,6 @@ llvm::Error Host::OpenFileInExternalEditor(llvm::StringRef editor, } bool Host::IsInteractiveGraphicSession() { return false; } - -bool Host::IsNetworkLimited() { return false; } #endif std::unique_ptr Host::CreateDefaultConnection(llvm::StringRef url) { diff --git a/lldb/source/Symbol/SymbolLocator.cpp b/lldb/source/Symbol/SymbolLocator.cpp index 93a5bc4..918f13ed 100644 --- a/lldb/source/Symbol/SymbolLocator.cpp +++ b/lldb/source/Symbol/SymbolLocator.cpp @@ -10,7 +10,6 @@ #include "lldb/Core/Debugger.h" #include "lldb/Core/PluginManager.h" -#include "lldb/Host/Host.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/ThreadPool.h" @@ -19,10 +18,12 @@ using namespace lldb; using namespace lldb_private; void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { + if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) + return; + static llvm::SmallSet g_seen_uuids; static std::mutex g_mutex; - - auto lookup = [=]() { + Debugger::GetThreadPool().async([=]() { { std::lock_guard guard(g_mutex); if (g_seen_uuids.count(uuid)) @@ -35,23 +36,12 @@ void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) 
{ module_spec.GetUUID() = uuid; if (!PluginManager::DownloadObjectAndSymbolFile(module_spec, error, /*force_lookup=*/true, - /*copy_executable=*/true)) + /*copy_executable=*/false)) return; if (error.Fail()) return; Debugger::ReportSymbolChange(module_spec); - }; - - switch (ModuleList::GetGlobalModuleListProperties().GetSymbolAutoDownload()) { - case eSymbolDownloadOff: - break; - case eSymbolDownloadBackground: - Debugger::GetThreadPool().async(lookup); - break; - case eSymbolDownloadForeground: - lookup(); - break; - }; + }); } -- cgit v1.1 From b8545e1ece271df16185d446503474c105d6398a Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 8 Feb 2024 12:15:33 -0800 Subject: [RISCV] Consider all subvector extracts within a single VREG cheap (#81032) This adjusts the isSubVectorExtractCheap callback to consider any extract which fits entirely within the first VLEN bits of the src vector (and uses a 5 bit immediate for the slide) as cheap. These can be done via a single m1 vslide1down.vi instruction. This allows our generic DAG combine logic to kick in and recognize a few more cases where shuffle source is longer than the dest, but that using a wider shuffle is still profitable. (Or as shown in the test diff, we can split the wider source and do two narrower shuffles.) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 27 +++-- .../RISCV/rvv/fixed-vectors-int-shuffles.ll | 110 ++++----------------- 2 files changed, 40 insertions(+), 97 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0799cc2..a62610b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2173,19 +2173,34 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, if (ResVT.isScalableVector() || SrcVT.isScalableVector()) return false; + EVT EltVT = ResVT.getVectorElementType(); + assert(EltVT == SrcVT.getVectorElementType() && "Should hold for node"); + + // The smallest type we can slide is i8. + // TODO: We can extract index 0 from a mask vector without a slide. + if (EltVT == MVT::i1) + return false; + unsigned ResElts = ResVT.getVectorNumElements(); unsigned SrcElts = SrcVT.getVectorNumElements(); + unsigned MinVLen = Subtarget.getRealMinVLen(); + unsigned MinVLMAX = MinVLen / EltVT.getSizeInBits(); + + // If we're extracting only data from the first VLEN bits of the source + // then we can always do this with an m1 vslidedown.vx. Restricting the + // Index ensures we can use a vslidedown.vi. + // TODO: We can generalize this when the exact VLEN is known. + if (Index + ResElts <= MinVLMAX && Index < 31) + return true; + // Convervatively only handle extracting half of a vector. - // TODO: Relax this. + // TODO: For sizes which aren't multiples of VLEN sizes, this may not be + // a cheap extract. However, this case is important in practice for + // shuffled extracts of longer vectors. How resolve? if ((ResElts * 2) != SrcElts) return false; - // The smallest type we can slide is i8. - // TODO: We can extract index 0 from a mask vector without a slide. - if (ResVT.getVectorElementType() == MVT::i1) - return false; - // Slide can support arbitrary index, but we only treat vslidedown.vi as // cheap. 
if (Index >= 32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index acad71b..0e8d9cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -722,97 +722,25 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) { ; FIXME: This could be expressed as a vrgather.vv define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) { -; RV32-LABEL: shuffle_v64i8_v8i8: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: li a0, 64 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV32-NEXT: vse8.v v8, (a1) -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 8 -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 16 -; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 24 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v8, v10, a0 -; RV32-NEXT: lbu a0, 32(sp) -; RV32-NEXT: lbu a1, 40(sp) -; RV32-NEXT: lbu a2, 48(sp) -; RV32-NEXT: lbu a3, 56(sp) -; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_v64i8_v8i8: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: li a0, 64 -; RV64-NEXT: mv a1, sp -; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV64-NEXT: vse8.v v8, (a1) -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 8 -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 16 -; RV64-NEXT: vmv.x.s a0, v12 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 24 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v8, v10, a0 -; RV64-NEXT: lbu a0, 32(sp) -; RV64-NEXT: lbu a1, 40(sp) -; RV64-NEXT: lbu a2, 48(sp) -; RV64-NEXT: lbu a3, 56(sp) -; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: 
vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_v64i8_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vsll.vi v14, v12, 3 +; CHECK-NEXT: vrgather.vv v12, v8, v14 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: li a1, 240 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: lui a1, 98561 +; CHECK-NEXT: addi a1, a1, -2048 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret %s = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> ret <8 x i8> %s } -- cgit v1.1 From 5f4b40c90a51248b097de7b5bc89c6976d4c3298 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 8 Feb 2024 12:39:04 -0800 Subject: [lldb] Expand background symbol download (#80890) LLDB has a setting (symbols.enable-background-lookup) that calls dsymForUUID on a background thread for images as they appear in the current backtrace. Originally, the laziness of only looking up symbols for images in the backtrace only existed to bring the number of dsymForUUID calls down to a manageable number. Users have been requesting the same functionality, but blocking. This gives them the same user experience as enabling dsymForUUID globally, but without the massive upfront cost of having to download all the images, the majority of which they'll likely not need. This patch renames the setting to have a more generic name (symbols.auto-download) and changes its values from a boolean to an enum. Users can now specify "off", "background" and "foreground". The default remains "off" although I'll probably change that in the near future. 
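A minimal usage sketch (assuming the renamed setting is surfaced under the existing `symbols.` prefix, like `symbols.enable-background-lookup`; the exact path may differ once this lands):

```
(lldb) settings set symbols.auto-download foreground
```

The old boolean stays behind as a backward-compatible alias for the background mode:

```
(lldb) settings set symbols.enable-background-lookup true
```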
--- lldb/include/lldb/Core/ModuleList.h | 23 ++++++++++++++++++++++- lldb/include/lldb/lldb-enumerations.h | 6 ++++++ lldb/source/Core/CoreProperties.td | 7 ++++++- lldb/source/Core/ModuleList.cpp | 13 +++++++++---- lldb/source/Symbol/SymbolLocator.cpp | 22 ++++++++++++++++------ 5 files changed, 59 insertions(+), 12 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index d78f7c5..43d931a 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -47,6 +47,26 @@ class UUID; class VariableList; struct ModuleFunctionSearchOptions; +static constexpr OptionEnumValueElement g_auto_download_enum_values[] = { + { + lldb::eSymbolDownloadOff, + "off", + "Disable automatically downloading symbols.", + }, + { + lldb::eSymbolDownloadBackground, + "background", + "Download symbols in the background for images as they appear in the " + "backtrace.", + }, + { + lldb::eSymbolDownloadForeground, + "foreground", + "Download symbols in the foreground for images as they appear in the " + "backtrace.", + }, +}; + class ModuleListProperties : public Properties { mutable llvm::sys::RWMutex m_symlink_paths_mutex; PathMappingList m_symlink_paths; @@ -60,7 +80,6 @@ public: bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); - bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -71,6 +90,8 @@ public: bool GetLoadSymbolOnDemand(); + lldb::SymbolDownload GetSymbolAutoDownload() const; + PathMappingList GetSymlinkMappings() const; }; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 7e9b538..4640533 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1314,6 +1314,12 @@ enum class ChildCacheState { ///< re-use what we computed the last time we called Update. 
}; +enum SymbolDownload { + eSymbolDownloadOff = 0, + eSymbolDownloadBackground = 1, + eSymbolDownloadForeground = 2, +}; + } // namespace lldb #endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 8d81967..9c4aa2d 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -8,7 +8,12 @@ let Definition = "modulelist" in { def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, Global, DefaultFalse, - Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; + Desc<"Alias for backward compatibility: when enabled this is the equivalent to 'symbols.download background'.">; + def AutoDownload: Property<"auto-download", "Enum">, + Global, + DefaultEnumValue<"eSymbolDownloadOff">, + EnumValues<"OptionEnumValues(g_auto_download_enum_values)">, + Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index b7f3936..b03490b 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -104,10 +104,15 @@ bool ModuleListProperties::SetEnableExternalLookup(bool new_value) { return SetPropertyAtIndex(ePropertyEnableExternalLookup, new_value); } -bool ModuleListProperties::GetEnableBackgroundLookup() const { - const uint32_t idx = ePropertyEnableBackgroundLookup; - return GetPropertyAtIndexAs( - idx, g_modulelist_properties[idx].default_uint_value != 0); +SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const { + // Backward compatibility alias. 
+ if (GetPropertyAtIndexAs(ePropertyEnableBackgroundLookup, false)) + return eSymbolDownloadBackground; + + const uint32_t idx = ePropertyAutoDownload; + return GetPropertyAtIndexAs( + idx, static_cast( + g_modulelist_properties[idx].default_uint_value)); } FileSpec ModuleListProperties::GetClangModulesCachePath() const { diff --git a/lldb/source/Symbol/SymbolLocator.cpp b/lldb/source/Symbol/SymbolLocator.cpp index 918f13ed..93a5bc4 100644 --- a/lldb/source/Symbol/SymbolLocator.cpp +++ b/lldb/source/Symbol/SymbolLocator.cpp @@ -10,6 +10,7 @@ #include "lldb/Core/Debugger.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Host/Host.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/ThreadPool.h" @@ -18,12 +19,10 @@ using namespace lldb; using namespace lldb_private; void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { - if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) - return; - static llvm::SmallSet g_seen_uuids; static std::mutex g_mutex; - Debugger::GetThreadPool().async([=]() { + + auto lookup = [=]() { { std::lock_guard guard(g_mutex); if (g_seen_uuids.count(uuid)) @@ -36,12 +35,23 @@ void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) { module_spec.GetUUID() = uuid; if (!PluginManager::DownloadObjectAndSymbolFile(module_spec, error, /*force_lookup=*/true, - /*copy_executable=*/false)) + /*copy_executable=*/true)) return; if (error.Fail()) return; Debugger::ReportSymbolChange(module_spec); - }); + }; + + switch (ModuleList::GetGlobalModuleListProperties().GetSymbolAutoDownload()) { + case eSymbolDownloadOff: + break; + case eSymbolDownloadBackground: + Debugger::GetThreadPool().async(lookup); + break; + case eSymbolDownloadForeground: + lookup(); + break; + }; } -- cgit v1.1 From 3f9d8d892e2de2ac2542cb8e88ae5317f3282244 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 9 Feb 2024 06:11:20 +0900 Subject: [Coverage] MCDCRecordProcessor: Find `ExecVectors` directly (#80816) Deprecate `TestVectors`, since no one uses it. This affects the output order of ExecVectors. The current impl emits sorted by binary value of ExecVector. This impl emits along the traversal of `buildTestVector()`. --- llvm/lib/ProfileData/Coverage/CoverageMapping.cpp | 31 ++++++++--------------- llvm/test/tools/llvm-cov/mcdc-const.test | 28 ++++++++++---------- llvm/test/tools/llvm-cov/mcdc-general.test | 16 ++++++------ 3 files changed, 33 insertions(+), 42 deletions(-) diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 6b189c3..eb0996e 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -253,9 +253,6 @@ class MCDCRecordProcessor { /// Mapping of calculated MC/DC Independence Pairs for each condition. MCDCRecord::TVPairMap IndependencePairs; - /// Total number of possible Test Vectors for the boolean expression. - MCDCRecord::TestVectors TestVectors; - /// Actual executed Test Vectors for the boolean expression, based on /// ExecutedTestVectorBitmap. 
MCDCRecord::TestVectors ExecVectors; @@ -267,18 +264,20 @@ public: : Bitmap(Bitmap), Region(Region), Branches(Branches), NumConditions(Region.MCDCParams.NumConditions), BitmapIdx(Region.MCDCParams.BitmapIdx * CHAR_BIT), - Folded(NumConditions, false), IndependencePairs(NumConditions), - TestVectors((size_t)1 << NumConditions) {} + Folded(NumConditions, false), IndependencePairs(NumConditions) {} private: void recordTestVector(MCDCRecord::TestVector &TV, unsigned Index, MCDCRecord::CondState Result) { + if (!Bitmap[BitmapIdx + Index]) + return; + // Copy the completed test vector to the vector of testvectors. - TestVectors[Index] = TV; + ExecVectors.push_back(TV); // The final value (T,F) is equal to the last non-dontcare state on the // path (in a short-circuiting system). - TestVectors[Index].push_back(Result); + ExecVectors.back().push_back(Result); } // Walk the binary decision diagram and try assigning both false and true to @@ -308,13 +307,11 @@ private: /// Walk the bits in the bitmap. A bit set to '1' indicates that the test /// vector at the corresponding index was executed during a test run. void findExecutedTestVectors() { - for (unsigned Idx = 0; Idx < (1u << NumConditions); ++Idx) { - assert(BitmapIdx + Idx < Bitmap.size() && "Bitmap overrun"); - if (Bitmap[BitmapIdx + Idx] == 0) - continue; - assert(!TestVectors[Idx].empty() && "Test Vector doesn't exist."); - ExecVectors.push_back(TestVectors[Idx]); - } + // Walk the binary decision diagram to enumerate all possible test vectors. + // We start at the root node (ID == 1) with all values being DontCare. + // `Index` encodes the bitmask of true values and is initially 0. + MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare); + buildTestVector(TV, 1, 0); } // Find an independence pair for each condition: @@ -380,12 +377,6 @@ public: Folded[I++] = (B->Count.isZero() && B->FalseCount.isZero()); } - // Walk the binary decision diagram to enumerate all possible test vectors. - // We start at the root node (ID == 1) with all values being DontCare. - // `Index` encodes the bitmask of true values and is initially 0. - MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare); - buildTestVector(TV, 1, 0); - // Using Profile Bitmap from runtime, mark the executed test vectors. 
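    // Sketch of the bitmap layout, inferred from the code above: each decision
    // owns a 2^NumConditions-bit region starting at BitmapIdx, and a set bit
    // at offset Index marks the test vector whose true-condition mask is
    // Index as executed.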
findExecutedTestVectors(); diff --git a/llvm/test/tools/llvm-cov/mcdc-const.test b/llvm/test/tools/llvm-cov/mcdc-const.test index 0b2c9c9..5424625 100644 --- a/llvm/test/tools/llvm-cov/mcdc-const.test +++ b/llvm/test/tools/llvm-cov/mcdc-const.test @@ -61,8 +61,8 @@ // CHECKFULLCASE: | C1-Pair: constant folded // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C = T } -// CHECKFULLCASE-NEXT: | 2 { F, C = T } +// CHECKFULLCASE: | 1 { F, C = T } +// CHECKFULLCASE-NEXT: | 2 { T, C = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% @@ -106,8 +106,8 @@ // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, - = T } +// CHECKFULLCASE: | 1 { F, C, - = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered @@ -118,8 +118,8 @@ // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, T = T } +// CHECKFULLCASE: | 1 { F, C, T = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered @@ -151,26 +151,26 @@ // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: covered: (2,3) // CHECKFULLCASE: | MC/DC Coverage for Decision: 100.00% -// CHECKFULLCASE: | 1 { T, -, C = T } -// CHECKFULLCASE-NEXT: | 2 { F, T, C = T } +// CHECKFULLCASE: | 1 { F, T, C = T } +// CHECKFULLCASE-NEXT: | 2 { T, -, C = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: constant folded // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, - = T } +// CHECKFULLCASE: | 1 { F, C, - = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, -, C = T } -// CHECKFULLCASE-NEXT: | 2 { F, T, C = T } +// CHECKFULLCASE: | 1 { F, T, C = T } +// CHECKFULLCASE-NEXT: | 2 { T, -, C = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: not covered // CHECKFULLCASE-NEXT: | C3-Pair: constant folded // CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00% -// CHECKFULLCASE: | 1 { T, C, - = T } -// CHECKFULLCASE-NEXT: | 2 { F, C, T = T } +// CHECKFULLCASE: | 1 { F, C, T = T } +// CHECKFULLCASE-NEXT: | 2 { T, C, - = T } // CHECKFULLCASE: | C1-Pair: not covered // CHECKFULLCASE-NEXT: | C2-Pair: constant folded // CHECKFULLCASE-NEXT: | C3-Pair: not covered diff --git a/llvm/test/tools/llvm-cov/mcdc-general.test b/llvm/test/tools/llvm-cov/mcdc-general.test index 753036b..4b59ce5 100644 --- a/llvm/test/tools/llvm-cov/mcdc-general.test +++ b/llvm/test/tools/llvm-cov/mcdc-general.test @@ -19,16 +19,16 @@ // CHECK-NEXT: | // CHECK-NEXT: | C1, C2, C3, C4 Result // CHECK-NEXT: | 1 { F, -, F, - = F } -// 
CHECK-NEXT: | 2 { T, F, F, - = F } -// CHECK-NEXT: | 3 { F, -, T, F = F } +// CHECK-NEXT: | 2 { F, -, T, F = F } +// CHECK-NEXT: | 3 { T, F, F, - = F } // CHECK-NEXT: | 4 { T, F, T, F = F } -// CHECK-NEXT: | 5 { T, T, -, - = T } -// CHECK-NEXT: | 6 { T, F, T, T = T } +// CHECK-NEXT: | 5 { T, F, T, T = T } +// CHECK-NEXT: | 6 { T, T, -, - = T } // CHECK-NEXT: | -// CHECK-NEXT: | C1-Pair: covered: (1,5) -// CHECK-NEXT: | C2-Pair: covered: (2,5) -// CHECK-NEXT: | C3-Pair: covered: (2,6) -// CHECK-NEXT: | C4-Pair: covered: (4,6) +// CHECK-NEXT: | C1-Pair: covered: (1,6) +// CHECK-NEXT: | C2-Pair: covered: (3,6) +// CHECK-NEXT: | C3-Pair: covered: (3,5) +// CHECK-NEXT: | C4-Pair: covered: (4,5) // CHECK-NEXT: | MC/DC Coverage for Decision: 100.00% // CHECK-NEXT: | // CHECK-NEXT: ------------------ -- cgit v1.1 From 581857278961b41bc1676499f92167b97a5e4c58 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 8 Feb 2024 13:20:47 -0800 Subject: [Object][Wasm] Generate symbol info from name section names (#81063) Currently symbol info is generated from a linking section or from export names. This PR generates symbols in a WasmObjectFile from the name section as well, which allows tools like objdump and nm to show useful information for more linked binaries. There are some limitations: most notably that we don't assume any particular ABI, so we don't get detailed information about data symbols if the segments are merged (which is the default). Covers most of the desired functionality from #76107 --- llvm/lib/Object/WasmObjectFile.cpp | 49 +++++++++++- .../wasm-linked-namesec-with-linkingsec.yaml | 40 ++++++++++ llvm/test/Object/wasm-linked-symbol-table.yaml | 75 +++++++++++++++++++ .../wasm/linked-symbol-table-namesec.yaml | 87 ++++++++++++++++++++++ .../llvm-objdump/wasm/linked-symbol-table.yaml | 75 ------------------- 5 files changed, 247 insertions(+), 79 deletions(-) create mode 100644 llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml create mode 100644 llvm/test/Object/wasm-linked-symbol-table.yaml create mode 100644 llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml delete mode 100644 llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 8c1bbe9..ea17154 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -508,10 +508,17 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { llvm::DenseSet SeenGlobals; llvm::DenseSet SeenSegments; + // If there is symbol info from the export section, this info will supersede + // it, but not info from a linking section + if (!HasLinkingSection) { + Symbols.clear(); + } + while (Ctx.Ptr < Ctx.End) { uint8_t Type = readUint8(Ctx); uint32_t Size = readVaruint32(Ctx); const uint8_t *SubSectionEnd = Ctx.Ptr + Size; + switch (Type) { case wasm::WASM_NAMES_FUNCTION: case wasm::WASM_NAMES_GLOBAL: @@ -521,6 +528,16 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { uint32_t Index = readVaruint32(Ctx); StringRef Name = readString(Ctx); wasm::NameType nameType = wasm::NameType::FUNCTION; + wasm::WasmSymbolInfo Info{Name, + /*Kind */ wasm::WASM_SYMBOL_TYPE_FUNCTION, + /* Flags */ 0, + /* ImportModule */ std::nullopt, + /* ImportName */ std::nullopt, + /* ExportName */ std::nullopt, + {/* ElementIndex */ Index}}; + const wasm::WasmSignature *Signature = nullptr; + const wasm::WasmGlobalType *GlobalType = nullptr; + const wasm::WasmTableType *TableType = nullptr; if (Type == 
wasm::WASM_NAMES_FUNCTION) { if (!SeenFunctions.insert(Index).second) return make_error( @@ -529,26 +546,50 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { return make_error("invalid function name entry", object_error::parse_failed); - if (isDefinedFunctionIndex(Index)) - getDefinedFunction(Index).DebugName = Name; + if (isDefinedFunctionIndex(Index)) { + wasm::WasmFunction &F = getDefinedFunction(Index); + F.DebugName = Name; + Signature = &Signatures[F.SigIndex]; + if (F.ExportName) { + Info.ExportName = F.ExportName; + Info.Flags |= wasm::WASM_SYMBOL_BINDING_GLOBAL; + } else { + Info.Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL; + } + } else { + Info.Flags |= wasm::WASM_SYMBOL_UNDEFINED; + } } else if (Type == wasm::WASM_NAMES_GLOBAL) { - nameType = wasm::NameType::GLOBAL; if (!SeenGlobals.insert(Index).second) return make_error("global named more than once", object_error::parse_failed); if (!isValidGlobalIndex(Index) || Name.empty()) return make_error("invalid global name entry", object_error::parse_failed); + nameType = wasm::NameType::GLOBAL; + Info.Kind = wasm::WASM_SYMBOL_TYPE_GLOBAL; + if (isDefinedGlobalIndex(Index)) { + GlobalType = &getDefinedGlobal(Index).Type; + } else { + Info.Flags |= wasm::WASM_SYMBOL_UNDEFINED; + } } else { - nameType = wasm::NameType::DATA_SEGMENT; if (!SeenSegments.insert(Index).second) return make_error( "segment named more than once", object_error::parse_failed); if (Index > DataSegments.size()) return make_error("invalid data segment name entry", object_error::parse_failed); + nameType = wasm::NameType::DATA_SEGMENT; + Info.Kind = wasm::WASM_SYMBOL_TYPE_DATA; + Info.Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL; + assert(Index < DataSegments.size()); + Info.DataRef = wasm::WasmDataReference{ + Index, 0, DataSegments[Index].Data.Content.size()}; } DebugNames.push_back(wasm::WasmDebugName{nameType, Index, Name}); + if (!HasLinkingSection) + Symbols.emplace_back(Info, GlobalType, TableType, Signature); } break; } diff --git a/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml new file mode 100644 index 0000000..c730417 --- /dev/null +++ b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml @@ -0,0 +1,40 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-nm -P %t.wasm | FileCheck %s +# +# Test that names from the linking section override those from the name section +# CHECK: foo T 1 0 +# CHECK-NOT: my_func_local_name + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: FUNCTION + FunctionTypes: [ 0, 0 ] + - Type: CODE + Functions: + - Index: 0 + Locals: + Body: 00 + - Index: 1 + Locals: + Body: 00 + - Type: CUSTOM + Name: linking + Version: 2 + SymbolTable: + - Index: 0 + Kind: FUNCTION + Name: foo + Flags: [ VISIBILITY_HIDDEN ] + Function: 0 + - Type: CUSTOM + Name: name + FunctionNames: + - Index: 1 + Name: my_func_local_name diff --git a/llvm/test/Object/wasm-linked-symbol-table.yaml b/llvm/test/Object/wasm-linked-symbol-table.yaml new file mode 100644 index 0000000..6dd949a --- /dev/null +++ b/llvm/test/Object/wasm-linked-symbol-table.yaml @@ -0,0 +1,75 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-objdump -t %t.wasm | FileCheck %s +# +# CHECK: SYMBOL TABLE: +# CHECK-NEXT: 0000009f g F CODE my_func_export +# CHECK-NEXT: 0000002a g O DATA my_global_export +# CHECK-NEXT: 00000000 g TABLE my_table_export + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 
0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0 ] + - Type: TABLE + Tables: + - Index: 0 + ElemType: FUNCREF + Limits: + Flags: [ HAS_MAX ] + Minimum: 0x1 + Maximum: 0x1 + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Name: my_table_export + Kind: TABLE + Index: 0 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '' diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml new file mode 100644 index 0000000..622a606 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml @@ -0,0 +1,87 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: llvm-objdump -t %t.wasm | FileCheck %s +# +# CHECK: SYMBOL TABLE: +# CHECK-NEXT: 00000000 F *UND* my_func_import_name +# CHECK-NEXT: 00000083 g F CODE my_func_export_name +# CHECK-NEXT: 00000086 l F CODE my_func_local_name +# CHECK-NEXT: 00000000 *UND* my_global_import_name +# CHECK-NEXT: 00000001 g GLOBAL my_global_export_name +# CHECK-NEXT: 00000000 l O DATA my_datasegment_name + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: TYPE + Signatures: + - Index: 0 + ParamTypes: [] + ReturnTypes: [] + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true + - Module: env + Field: memory + Kind: MEMORY + Memory: + Minimum: 0x1 + - Type: FUNCTION + FunctionTypes: [ 0, 0 ] + - Type: GLOBAL + Globals: + - Index: 1 + Mutable: false + Type: I32 + InitExpr: + Opcode: I32_CONST + Value: 42 + - Type: EXPORT + Exports: + - Name: my_func_export + Kind: FUNCTION + Index: 1 + - Name: my_global_export + Kind: GLOBAL + Index: 1 + - Type: CODE + Functions: + - Index: 1 + Locals: + Body: 00 + - Index: 2 + Locals: + Body: 00 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: 'abcd1234' + - Type: CUSTOM + Name: name + FunctionNames: + - Index: 0 + Name: my_func_import_name + - Index: 1 + Name: my_func_export_name + - Index: 2 + Name: my_func_local_name + GlobalNames: + - Index: 0 + Name: my_global_import_name + - Index: 1 + Name: my_global_export_name + DataSegmentNames: + - Index: 0 + Name: my_datasegment_name diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml deleted file mode 100644 index 6dd949a..0000000 --- a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# RUN: yaml2obj %s -o %t.wasm -# RUN: llvm-objdump -t %t.wasm | FileCheck %s -# -# CHECK: SYMBOL TABLE: -# CHECK-NEXT: 0000009f g F CODE my_func_export -# CHECK-NEXT: 0000002a g O DATA my_global_export -# CHECK-NEXT: 00000000 g TABLE my_table_export - ---- !WASM -FileHeader: - Version: 0x1 -Sections: - - Type: TYPE - Signatures: - - Index: 0 - ParamTypes: [] - ReturnTypes: [] - - Type: IMPORT - Imports: - - 
Module: env - Field: foo - Kind: FUNCTION - SigIndex: 0 - - Module: env - Field: bar - Kind: GLOBAL - GlobalType: I32 - GlobalMutable: true - - Module: env - Field: memory - Kind: MEMORY - Memory: - Minimum: 0x1 - - Type: FUNCTION - FunctionTypes: [ 0 ] - - Type: TABLE - Tables: - - Index: 0 - ElemType: FUNCREF - Limits: - Flags: [ HAS_MAX ] - Minimum: 0x1 - Maximum: 0x1 - - Type: GLOBAL - Globals: - - Index: 1 - Mutable: false - Type: I32 - InitExpr: - Opcode: I32_CONST - Value: 42 - - Type: EXPORT - Exports: - - Name: my_func_export - Kind: FUNCTION - Index: 1 - - Name: my_global_export - Kind: GLOBAL - Index: 1 - - Name: my_table_export - Kind: TABLE - Index: 0 - - Type: CODE - Functions: - - Index: 1 - Locals: - Body: 00 - - Type: DATA - Segments: - - SectionOffset: 0 - InitFlags: 0 - Offset: - Opcode: I32_CONST - Value: 0 - Content: '' -- cgit v1.1 From cdde0d9602217eb0bc091b4de16197e6aa5bb132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 8 Feb 2024 16:03:22 +0100 Subject: [clang][Interp][NFC] Make a local variable const --- clang/lib/AST/Interp/Interp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index a76e633..1299a70 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -811,7 +811,7 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) { const auto *CmpValueInfo = CmpInfo->getValueInfo(CmpResult); assert(CmpValueInfo); assert(CmpValueInfo->hasValidIntValue()); - APSInt IntValue = CmpValueInfo->getIntValue(); + const APSInt &IntValue = CmpValueInfo->getIntValue(); return SetThreeWayComparisonField(S, OpPC, P, IntValue); } -- cgit v1.1 From 7c9c4983b1d493c2fdea76f99591f9ab49877306 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Feb 2024 13:43:11 -0800 Subject: [DWARFLinkerParallel] Fix member initialization order (#81179) DWARFLinkerImpl::DWARFLinkerImpl initializes DebugStrStrings/DebugLineStrStrings/CommonSections using GlobalData but GlobalData is initialized after the three members. Move GlobalData before. Fix #81110 --- llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h index 527c7a0..7c17c5b 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h @@ -351,6 +351,9 @@ protected: /// \defgroup Data members accessed sequentially. /// /// @{ + /// Data global for the whole linking process. + LinkingGlobalData GlobalData; + /// DwarfStringPoolEntries for .debug_str section. StringEntryToDwarfStringPoolEntryMap DebugStrStrings; @@ -368,9 +371,6 @@ protected: /// Overall compile units number. uint64_t OverallNumberOfCU = 0; - - /// Data global for the whole linking process. - LinkingGlobalData GlobalData; /// @} }; -- cgit v1.1 From f78c9b88b7a1a54cf67037f9088a3e48779b1e44 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Feb 2024 13:39:58 -0800 Subject: [RISCV] Use MCPhysReg for AllPopRegs. NFC MCPhysReg is 2 bytes, while Register is 4 bytes. 
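The size difference is easy to check directly. A minimal sketch, assuming the usual LLVM typedefs (`MCPhysReg` is a `uint16_t`, `llvm::Register` wraps an `unsigned`) on a target where `unsigned` is 4 bytes:

```
#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCRegister.h"

// Each AllPopRegs entry now occupies 2 bytes instead of 4.
static_assert(sizeof(llvm::MCPhysReg) == 2, "MCPhysReg is a uint16_t");
static_assert(sizeof(llvm::Register) == 4, "Register wraps an unsigned");
```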
--- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index e5b5103..b12b497 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -42,7 +42,7 @@ RISCVFrameLowering::RISCVFrameLowering(const RISCVSubtarget &STI) /*TransientStackAlignment=*/Align(16)), STI(STI) {} -static const Register AllPopRegs[] = { +static const MCPhysReg AllPopRegs[] = { RISCV::X1, RISCV::X8, RISCV::X9, RISCV::X18, RISCV::X19, RISCV::X20, RISCV::X21, RISCV::X22, RISCV::X23, RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27}; -- cgit v1.1 From 4b54b474aa0ffb355faa63cc2d8f95fd321c887f Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 8 Feb 2024 13:53:18 -0800 Subject: [NVPTX][NFC] cleanup dead vars, use MAKE_CASE (#81161) Cleanup some dead variables. In addition, switch to a `MAKE_CASE` macro, similar to other targets, to reduce boilerplate. --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 1 - llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1048 +++++++++++---------------- 2 files changed, 428 insertions(+), 621 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 6c4879b..cdfc288 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1530,7 +1530,6 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (isKernelFunction(*F)) { if (isSampler(*I) || isImage(*I)) { if (isImage(*I)) { - std::string sname = std::string(I->getName()); if (isImageWriteOnly(*I) || isImageReadWrite(*I)) { if (hasImageHandles) O << "\t.param .u64 .ptr .surfref "; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 24e0be2..c7bc623 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -858,623 +858,432 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { + +#define MAKE_CASE(V) \ + case V: \ + return #V; + switch ((NVPTXISD::NodeType)Opcode) { case NVPTXISD::FIRST_NUMBER: break; - case NVPTXISD::CALL: - return "NVPTXISD::CALL"; - case NVPTXISD::RET_GLUE: - return "NVPTXISD::RET_GLUE"; - case NVPTXISD::LOAD_PARAM: - return "NVPTXISD::LOAD_PARAM"; - case NVPTXISD::Wrapper: - return "NVPTXISD::Wrapper"; - case NVPTXISD::DeclareParam: - return "NVPTXISD::DeclareParam"; - case NVPTXISD::DeclareScalarParam: - return "NVPTXISD::DeclareScalarParam"; - case NVPTXISD::DeclareRet: - return "NVPTXISD::DeclareRet"; - case NVPTXISD::DeclareScalarRet: - return "NVPTXISD::DeclareScalarRet"; - case NVPTXISD::DeclareRetParam: - return "NVPTXISD::DeclareRetParam"; - case NVPTXISD::PrintCall: - return "NVPTXISD::PrintCall"; - case NVPTXISD::PrintConvergentCall: - return "NVPTXISD::PrintConvergentCall"; - case NVPTXISD::PrintCallUni: - return "NVPTXISD::PrintCallUni"; - case NVPTXISD::PrintConvergentCallUni: - return "NVPTXISD::PrintConvergentCallUni"; - case NVPTXISD::LoadParam: - return "NVPTXISD::LoadParam"; - case NVPTXISD::LoadParamV2: - return "NVPTXISD::LoadParamV2"; - case NVPTXISD::LoadParamV4: - return "NVPTXISD::LoadParamV4"; - case NVPTXISD::StoreParam: - return "NVPTXISD::StoreParam"; - case NVPTXISD::StoreParamV2: - return "NVPTXISD::StoreParamV2"; - case NVPTXISD::StoreParamV4: - return 
"NVPTXISD::StoreParamV4"; - case NVPTXISD::StoreParamS32: - return "NVPTXISD::StoreParamS32"; - case NVPTXISD::StoreParamU32: - return "NVPTXISD::StoreParamU32"; - case NVPTXISD::CallArgBegin: - return "NVPTXISD::CallArgBegin"; - case NVPTXISD::CallArg: - return "NVPTXISD::CallArg"; - case NVPTXISD::LastCallArg: - return "NVPTXISD::LastCallArg"; - case NVPTXISD::CallArgEnd: - return "NVPTXISD::CallArgEnd"; - case NVPTXISD::CallVoid: - return "NVPTXISD::CallVoid"; - case NVPTXISD::CallVal: - return "NVPTXISD::CallVal"; - case NVPTXISD::CallSymbol: - return "NVPTXISD::CallSymbol"; - case NVPTXISD::Prototype: - return "NVPTXISD::Prototype"; - case NVPTXISD::MoveParam: - return "NVPTXISD::MoveParam"; - case NVPTXISD::StoreRetval: - return "NVPTXISD::StoreRetval"; - case NVPTXISD::StoreRetvalV2: - return "NVPTXISD::StoreRetvalV2"; - case NVPTXISD::StoreRetvalV4: - return "NVPTXISD::StoreRetvalV4"; - case NVPTXISD::PseudoUseParam: - return "NVPTXISD::PseudoUseParam"; - case NVPTXISD::RETURN: - return "NVPTXISD::RETURN"; - case NVPTXISD::CallSeqBegin: - return "NVPTXISD::CallSeqBegin"; - case NVPTXISD::CallSeqEnd: - return "NVPTXISD::CallSeqEnd"; - case NVPTXISD::CallPrototype: - return "NVPTXISD::CallPrototype"; - case NVPTXISD::ProxyReg: - return "NVPTXISD::ProxyReg"; - case NVPTXISD::LoadV2: - return "NVPTXISD::LoadV2"; - case NVPTXISD::LoadV4: - return "NVPTXISD::LoadV4"; - case NVPTXISD::LDGV2: - return "NVPTXISD::LDGV2"; - case NVPTXISD::LDGV4: - return "NVPTXISD::LDGV4"; - case NVPTXISD::LDUV2: - return "NVPTXISD::LDUV2"; - case NVPTXISD::LDUV4: - return "NVPTXISD::LDUV4"; - case NVPTXISD::StoreV2: - return "NVPTXISD::StoreV2"; - case NVPTXISD::StoreV4: - return "NVPTXISD::StoreV4"; - case NVPTXISD::FUN_SHFL_CLAMP: - return "NVPTXISD::FUN_SHFL_CLAMP"; - case NVPTXISD::FUN_SHFR_CLAMP: - return "NVPTXISD::FUN_SHFR_CLAMP"; - case NVPTXISD::IMAD: - return "NVPTXISD::IMAD"; - case NVPTXISD::BFE: - return "NVPTXISD::BFE"; - case NVPTXISD::BFI: - return "NVPTXISD::BFI"; - case NVPTXISD::PRMT: - return "NVPTXISD::PRMT"; - case NVPTXISD::SETP_F16X2: - return "NVPTXISD::SETP_F16X2"; - case NVPTXISD::SETP_BF16X2: - return "NVPTXISD::SETP_BF16X2"; - case NVPTXISD::Dummy: - return "NVPTXISD::Dummy"; - case NVPTXISD::MUL_WIDE_SIGNED: - return "NVPTXISD::MUL_WIDE_SIGNED"; - case NVPTXISD::MUL_WIDE_UNSIGNED: - return "NVPTXISD::MUL_WIDE_UNSIGNED"; - case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; - case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; - case NVPTXISD::Tex1DFloatFloatLevel: - return "NVPTXISD::Tex1DFloatFloatLevel"; - case NVPTXISD::Tex1DFloatFloatGrad: - return "NVPTXISD::Tex1DFloatFloatGrad"; - case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; - case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; - case NVPTXISD::Tex1DS32FloatLevel: - return "NVPTXISD::Tex1DS32FloatLevel"; - case NVPTXISD::Tex1DS32FloatGrad: - return "NVPTXISD::Tex1DS32FloatGrad"; - case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; - case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; - case NVPTXISD::Tex1DU32FloatLevel: - return "NVPTXISD::Tex1DU32FloatLevel"; - case NVPTXISD::Tex1DU32FloatGrad: - return "NVPTXISD::Tex1DU32FloatGrad"; - case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; - case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; - case NVPTXISD::Tex1DArrayFloatFloatLevel: - return "NVPTXISD::Tex1DArrayFloatFloatLevel"; - case NVPTXISD::Tex1DArrayFloatFloatGrad: - return 
"NVPTXISD::Tex1DArrayFloatFloatGrad"; - case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; - case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; - case NVPTXISD::Tex1DArrayS32FloatLevel: - return "NVPTXISD::Tex1DArrayS32FloatLevel"; - case NVPTXISD::Tex1DArrayS32FloatGrad: - return "NVPTXISD::Tex1DArrayS32FloatGrad"; - case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; - case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; - case NVPTXISD::Tex1DArrayU32FloatLevel: - return "NVPTXISD::Tex1DArrayU32FloatLevel"; - case NVPTXISD::Tex1DArrayU32FloatGrad: - return "NVPTXISD::Tex1DArrayU32FloatGrad"; - case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; - case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; - case NVPTXISD::Tex2DFloatFloatLevel: - return "NVPTXISD::Tex2DFloatFloatLevel"; - case NVPTXISD::Tex2DFloatFloatGrad: - return "NVPTXISD::Tex2DFloatFloatGrad"; - case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; - case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; - case NVPTXISD::Tex2DS32FloatLevel: - return "NVPTXISD::Tex2DS32FloatLevel"; - case NVPTXISD::Tex2DS32FloatGrad: - return "NVPTXISD::Tex2DS32FloatGrad"; - case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; - case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; - case NVPTXISD::Tex2DU32FloatLevel: - return "NVPTXISD::Tex2DU32FloatLevel"; - case NVPTXISD::Tex2DU32FloatGrad: - return "NVPTXISD::Tex2DU32FloatGrad"; - case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; - case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; - case NVPTXISD::Tex2DArrayFloatFloatLevel: - return "NVPTXISD::Tex2DArrayFloatFloatLevel"; - case NVPTXISD::Tex2DArrayFloatFloatGrad: - return "NVPTXISD::Tex2DArrayFloatFloatGrad"; - case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; - case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; - case NVPTXISD::Tex2DArrayS32FloatLevel: - return "NVPTXISD::Tex2DArrayS32FloatLevel"; - case NVPTXISD::Tex2DArrayS32FloatGrad: - return "NVPTXISD::Tex2DArrayS32FloatGrad"; - case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; - case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; - case NVPTXISD::Tex2DArrayU32FloatLevel: - return "NVPTXISD::Tex2DArrayU32FloatLevel"; - case NVPTXISD::Tex2DArrayU32FloatGrad: - return "NVPTXISD::Tex2DArrayU32FloatGrad"; - case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; - case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; - case NVPTXISD::Tex3DFloatFloatLevel: - return "NVPTXISD::Tex3DFloatFloatLevel"; - case NVPTXISD::Tex3DFloatFloatGrad: - return "NVPTXISD::Tex3DFloatFloatGrad"; - case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; - case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; - case NVPTXISD::Tex3DS32FloatLevel: - return "NVPTXISD::Tex3DS32FloatLevel"; - case NVPTXISD::Tex3DS32FloatGrad: - return "NVPTXISD::Tex3DS32FloatGrad"; - case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; - case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; - case NVPTXISD::Tex3DU32FloatLevel: - return "NVPTXISD::Tex3DU32FloatLevel"; - case NVPTXISD::Tex3DU32FloatGrad: - return "NVPTXISD::Tex3DU32FloatGrad"; - case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; - case NVPTXISD::TexCubeFloatFloatLevel: - return "NVPTXISD::TexCubeFloatFloatLevel"; - case 
NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; - case NVPTXISD::TexCubeS32FloatLevel: - return "NVPTXISD::TexCubeS32FloatLevel"; - case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; - case NVPTXISD::TexCubeU32FloatLevel: - return "NVPTXISD::TexCubeU32FloatLevel"; - case NVPTXISD::TexCubeArrayFloatFloat: - return "NVPTXISD::TexCubeArrayFloatFloat"; - case NVPTXISD::TexCubeArrayFloatFloatLevel: - return "NVPTXISD::TexCubeArrayFloatFloatLevel"; - case NVPTXISD::TexCubeArrayS32Float: - return "NVPTXISD::TexCubeArrayS32Float"; - case NVPTXISD::TexCubeArrayS32FloatLevel: - return "NVPTXISD::TexCubeArrayS32FloatLevel"; - case NVPTXISD::TexCubeArrayU32Float: - return "NVPTXISD::TexCubeArrayU32Float"; - case NVPTXISD::TexCubeArrayU32FloatLevel: - return "NVPTXISD::TexCubeArrayU32FloatLevel"; - case NVPTXISD::Tld4R2DFloatFloat: - return "NVPTXISD::Tld4R2DFloatFloat"; - case NVPTXISD::Tld4G2DFloatFloat: - return "NVPTXISD::Tld4G2DFloatFloat"; - case NVPTXISD::Tld4B2DFloatFloat: - return "NVPTXISD::Tld4B2DFloatFloat"; - case NVPTXISD::Tld4A2DFloatFloat: - return "NVPTXISD::Tld4A2DFloatFloat"; - case NVPTXISD::Tld4R2DS64Float: - return "NVPTXISD::Tld4R2DS64Float"; - case NVPTXISD::Tld4G2DS64Float: - return "NVPTXISD::Tld4G2DS64Float"; - case NVPTXISD::Tld4B2DS64Float: - return "NVPTXISD::Tld4B2DS64Float"; - case NVPTXISD::Tld4A2DS64Float: - return "NVPTXISD::Tld4A2DS64Float"; - case NVPTXISD::Tld4R2DU64Float: - return "NVPTXISD::Tld4R2DU64Float"; - case NVPTXISD::Tld4G2DU64Float: - return "NVPTXISD::Tld4G2DU64Float"; - case NVPTXISD::Tld4B2DU64Float: - return "NVPTXISD::Tld4B2DU64Float"; - case NVPTXISD::Tld4A2DU64Float: - return "NVPTXISD::Tld4A2DU64Float"; - - case NVPTXISD::TexUnified1DFloatS32: - return "NVPTXISD::TexUnified1DFloatS32"; - case NVPTXISD::TexUnified1DFloatFloat: - return "NVPTXISD::TexUnified1DFloatFloat"; - case NVPTXISD::TexUnified1DFloatFloatLevel: - return "NVPTXISD::TexUnified1DFloatFloatLevel"; - case NVPTXISD::TexUnified1DFloatFloatGrad: - return "NVPTXISD::TexUnified1DFloatFloatGrad"; - case NVPTXISD::TexUnified1DS32S32: - return "NVPTXISD::TexUnified1DS32S32"; - case NVPTXISD::TexUnified1DS32Float: - return "NVPTXISD::TexUnified1DS32Float"; - case NVPTXISD::TexUnified1DS32FloatLevel: - return "NVPTXISD::TexUnified1DS32FloatLevel"; - case NVPTXISD::TexUnified1DS32FloatGrad: - return "NVPTXISD::TexUnified1DS32FloatGrad"; - case NVPTXISD::TexUnified1DU32S32: - return "NVPTXISD::TexUnified1DU32S32"; - case NVPTXISD::TexUnified1DU32Float: - return "NVPTXISD::TexUnified1DU32Float"; - case NVPTXISD::TexUnified1DU32FloatLevel: - return "NVPTXISD::TexUnified1DU32FloatLevel"; - case NVPTXISD::TexUnified1DU32FloatGrad: - return "NVPTXISD::TexUnified1DU32FloatGrad"; - case NVPTXISD::TexUnified1DArrayFloatS32: - return "NVPTXISD::TexUnified1DArrayFloatS32"; - case NVPTXISD::TexUnified1DArrayFloatFloat: - return "NVPTXISD::TexUnified1DArrayFloatFloat"; - case NVPTXISD::TexUnified1DArrayFloatFloatLevel: - return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; - case NVPTXISD::TexUnified1DArrayFloatFloatGrad: - return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; - case NVPTXISD::TexUnified1DArrayS32S32: - return "NVPTXISD::TexUnified1DArrayS32S32"; - case NVPTXISD::TexUnified1DArrayS32Float: - return "NVPTXISD::TexUnified1DArrayS32Float"; - case NVPTXISD::TexUnified1DArrayS32FloatLevel: - return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; - case NVPTXISD::TexUnified1DArrayS32FloatGrad: - return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; - case 
NVPTXISD::TexUnified1DArrayU32S32: - return "NVPTXISD::TexUnified1DArrayU32S32"; - case NVPTXISD::TexUnified1DArrayU32Float: - return "NVPTXISD::TexUnified1DArrayU32Float"; - case NVPTXISD::TexUnified1DArrayU32FloatLevel: - return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; - case NVPTXISD::TexUnified1DArrayU32FloatGrad: - return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; - case NVPTXISD::TexUnified2DFloatS32: - return "NVPTXISD::TexUnified2DFloatS32"; - case NVPTXISD::TexUnified2DFloatFloat: - return "NVPTXISD::TexUnified2DFloatFloat"; - case NVPTXISD::TexUnified2DFloatFloatLevel: - return "NVPTXISD::TexUnified2DFloatFloatLevel"; - case NVPTXISD::TexUnified2DFloatFloatGrad: - return "NVPTXISD::TexUnified2DFloatFloatGrad"; - case NVPTXISD::TexUnified2DS32S32: - return "NVPTXISD::TexUnified2DS32S32"; - case NVPTXISD::TexUnified2DS32Float: - return "NVPTXISD::TexUnified2DS32Float"; - case NVPTXISD::TexUnified2DS32FloatLevel: - return "NVPTXISD::TexUnified2DS32FloatLevel"; - case NVPTXISD::TexUnified2DS32FloatGrad: - return "NVPTXISD::TexUnified2DS32FloatGrad"; - case NVPTXISD::TexUnified2DU32S32: - return "NVPTXISD::TexUnified2DU32S32"; - case NVPTXISD::TexUnified2DU32Float: - return "NVPTXISD::TexUnified2DU32Float"; - case NVPTXISD::TexUnified2DU32FloatLevel: - return "NVPTXISD::TexUnified2DU32FloatLevel"; - case NVPTXISD::TexUnified2DU32FloatGrad: - return "NVPTXISD::TexUnified2DU32FloatGrad"; - case NVPTXISD::TexUnified2DArrayFloatS32: - return "NVPTXISD::TexUnified2DArrayFloatS32"; - case NVPTXISD::TexUnified2DArrayFloatFloat: - return "NVPTXISD::TexUnified2DArrayFloatFloat"; - case NVPTXISD::TexUnified2DArrayFloatFloatLevel: - return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; - case NVPTXISD::TexUnified2DArrayFloatFloatGrad: - return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; - case NVPTXISD::TexUnified2DArrayS32S32: - return "NVPTXISD::TexUnified2DArrayS32S32"; - case NVPTXISD::TexUnified2DArrayS32Float: - return "NVPTXISD::TexUnified2DArrayS32Float"; - case NVPTXISD::TexUnified2DArrayS32FloatLevel: - return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; - case NVPTXISD::TexUnified2DArrayS32FloatGrad: - return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; - case NVPTXISD::TexUnified2DArrayU32S32: - return "NVPTXISD::TexUnified2DArrayU32S32"; - case NVPTXISD::TexUnified2DArrayU32Float: - return "NVPTXISD::TexUnified2DArrayU32Float"; - case NVPTXISD::TexUnified2DArrayU32FloatLevel: - return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; - case NVPTXISD::TexUnified2DArrayU32FloatGrad: - return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; - case NVPTXISD::TexUnified3DFloatS32: - return "NVPTXISD::TexUnified3DFloatS32"; - case NVPTXISD::TexUnified3DFloatFloat: - return "NVPTXISD::TexUnified3DFloatFloat"; - case NVPTXISD::TexUnified3DFloatFloatLevel: - return "NVPTXISD::TexUnified3DFloatFloatLevel"; - case NVPTXISD::TexUnified3DFloatFloatGrad: - return "NVPTXISD::TexUnified3DFloatFloatGrad"; - case NVPTXISD::TexUnified3DS32S32: - return "NVPTXISD::TexUnified3DS32S32"; - case NVPTXISD::TexUnified3DS32Float: - return "NVPTXISD::TexUnified3DS32Float"; - case NVPTXISD::TexUnified3DS32FloatLevel: - return "NVPTXISD::TexUnified3DS32FloatLevel"; - case NVPTXISD::TexUnified3DS32FloatGrad: - return "NVPTXISD::TexUnified3DS32FloatGrad"; - case NVPTXISD::TexUnified3DU32S32: - return "NVPTXISD::TexUnified3DU32S32"; - case NVPTXISD::TexUnified3DU32Float: - return "NVPTXISD::TexUnified3DU32Float"; - case NVPTXISD::TexUnified3DU32FloatLevel: - return "NVPTXISD::TexUnified3DU32FloatLevel"; - case 
NVPTXISD::TexUnified3DU32FloatGrad: - return "NVPTXISD::TexUnified3DU32FloatGrad"; - case NVPTXISD::TexUnifiedCubeFloatFloat: - return "NVPTXISD::TexUnifiedCubeFloatFloat"; - case NVPTXISD::TexUnifiedCubeFloatFloatLevel: - return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; - case NVPTXISD::TexUnifiedCubeS32Float: - return "NVPTXISD::TexUnifiedCubeS32Float"; - case NVPTXISD::TexUnifiedCubeS32FloatLevel: - return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; - case NVPTXISD::TexUnifiedCubeU32Float: - return "NVPTXISD::TexUnifiedCubeU32Float"; - case NVPTXISD::TexUnifiedCubeU32FloatLevel: - return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloat: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayS32Float: - return "NVPTXISD::TexUnifiedCubeArrayS32Float"; - case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayU32Float: - return "NVPTXISD::TexUnifiedCubeArrayU32Float"; - case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; - case NVPTXISD::TexUnifiedCubeFloatFloatGrad: - return "NVPTXISD::TexUnifiedCubeFloatFloatGrad"; - case NVPTXISD::TexUnifiedCubeS32FloatGrad: - return "NVPTXISD::TexUnifiedCubeS32FloatGrad"; - case NVPTXISD::TexUnifiedCubeU32FloatGrad: - return "NVPTXISD::TexUnifiedCubeU32FloatGrad"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad"; - case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: - return "NVPTXISD::TexUnifiedCubeArrayS32FloatGrad"; - case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: - return "NVPTXISD::TexUnifiedCubeArrayU32FloatGrad"; - case NVPTXISD::Tld4UnifiedR2DFloatFloat: - return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; - case NVPTXISD::Tld4UnifiedG2DFloatFloat: - return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; - case NVPTXISD::Tld4UnifiedB2DFloatFloat: - return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; - case NVPTXISD::Tld4UnifiedA2DFloatFloat: - return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; - case NVPTXISD::Tld4UnifiedR2DS64Float: - return "NVPTXISD::Tld4UnifiedR2DS64Float"; - case NVPTXISD::Tld4UnifiedG2DS64Float: - return "NVPTXISD::Tld4UnifiedG2DS64Float"; - case NVPTXISD::Tld4UnifiedB2DS64Float: - return "NVPTXISD::Tld4UnifiedB2DS64Float"; - case NVPTXISD::Tld4UnifiedA2DS64Float: - return "NVPTXISD::Tld4UnifiedA2DS64Float"; - case NVPTXISD::Tld4UnifiedR2DU64Float: - return "NVPTXISD::Tld4UnifiedR2DU64Float"; - case NVPTXISD::Tld4UnifiedG2DU64Float: - return "NVPTXISD::Tld4UnifiedG2DU64Float"; - case NVPTXISD::Tld4UnifiedB2DU64Float: - return "NVPTXISD::Tld4UnifiedB2DU64Float"; - case NVPTXISD::Tld4UnifiedA2DU64Float: - return "NVPTXISD::Tld4UnifiedA2DU64Float"; - - case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; - case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; - case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; - case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; - case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; - case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; - case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; - case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; - case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; - case 
NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; - case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; - - case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; - case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; - case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; - case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; - case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; - case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; - case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; - case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; - case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; - case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; - case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; - - case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; - case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; - case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; - case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; - case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; - case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; - case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; - case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; - case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; - case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; - case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; - - case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; - case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; - case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; - case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; - case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; - case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; - case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; - case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; - case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; - case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; - case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; - - case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; - case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; - case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; - case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; - case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; - case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; - case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; - case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; - case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; - case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; - case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; - - case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; - case NVPTXISD::Suld1DI16Trap: return 
"NVPTXISD::Suld1DI16Trap"; - case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; - case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; - case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; - case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; - case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; - case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; - case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; - case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; - case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; - - case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; - case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; - case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; - case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; - case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; - case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; - case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; - case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; - case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; - case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; - case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; - - case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; - case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; - case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; - case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; - case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; - case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; - case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; - case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; - case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; - case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; - case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; - - case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; - case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; - case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; - case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; - case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; - case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; - case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; - case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; - case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; - case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; - case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; - - case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; - case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; - case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; - case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; - case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; - case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; - case NVPTXISD::Suld3DV2I32Trap: 
return "NVPTXISD::Suld3DV2I32Trap"; - case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; - case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; - case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; - case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; - - case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; - case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; - case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; - case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; - case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; - case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; - case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; - case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; - case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; - case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; - case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; - - case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; - case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; - case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; - case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; - case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; - case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; - case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; - case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; - case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; - case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; - case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; - - case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; - case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; - case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; - case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; - case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; - case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; - case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; - case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; - case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; - case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; - case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; - - case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; - case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; - case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; - case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; - case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; - case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; - case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; - case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; - case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; - case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; - case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; - - case 
NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; - case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; - case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; - case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; - case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; - case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; - case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; - case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; - case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; - case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; - case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; + + MAKE_CASE(NVPTXISD::CALL) + MAKE_CASE(NVPTXISD::RET_GLUE) + MAKE_CASE(NVPTXISD::LOAD_PARAM) + MAKE_CASE(NVPTXISD::Wrapper) + MAKE_CASE(NVPTXISD::DeclareParam) + MAKE_CASE(NVPTXISD::DeclareScalarParam) + MAKE_CASE(NVPTXISD::DeclareRet) + MAKE_CASE(NVPTXISD::DeclareScalarRet) + MAKE_CASE(NVPTXISD::DeclareRetParam) + MAKE_CASE(NVPTXISD::PrintCall) + MAKE_CASE(NVPTXISD::PrintConvergentCall) + MAKE_CASE(NVPTXISD::PrintCallUni) + MAKE_CASE(NVPTXISD::PrintConvergentCallUni) + MAKE_CASE(NVPTXISD::LoadParam) + MAKE_CASE(NVPTXISD::LoadParamV2) + MAKE_CASE(NVPTXISD::LoadParamV4) + MAKE_CASE(NVPTXISD::StoreParam) + MAKE_CASE(NVPTXISD::StoreParamV2) + MAKE_CASE(NVPTXISD::StoreParamV4) + MAKE_CASE(NVPTXISD::StoreParamS32) + MAKE_CASE(NVPTXISD::StoreParamU32) + MAKE_CASE(NVPTXISD::CallArgBegin) + MAKE_CASE(NVPTXISD::CallArg) + MAKE_CASE(NVPTXISD::LastCallArg) + MAKE_CASE(NVPTXISD::CallArgEnd) + MAKE_CASE(NVPTXISD::CallVoid) + MAKE_CASE(NVPTXISD::CallVal) + MAKE_CASE(NVPTXISD::CallSymbol) + MAKE_CASE(NVPTXISD::Prototype) + MAKE_CASE(NVPTXISD::MoveParam) + MAKE_CASE(NVPTXISD::StoreRetval) + MAKE_CASE(NVPTXISD::StoreRetvalV2) + MAKE_CASE(NVPTXISD::StoreRetvalV4) + MAKE_CASE(NVPTXISD::PseudoUseParam) + MAKE_CASE(NVPTXISD::RETURN) + MAKE_CASE(NVPTXISD::CallSeqBegin) + MAKE_CASE(NVPTXISD::CallSeqEnd) + MAKE_CASE(NVPTXISD::CallPrototype) + MAKE_CASE(NVPTXISD::ProxyReg) + MAKE_CASE(NVPTXISD::LoadV2) + MAKE_CASE(NVPTXISD::LoadV4) + MAKE_CASE(NVPTXISD::LDGV2) + MAKE_CASE(NVPTXISD::LDGV4) + MAKE_CASE(NVPTXISD::LDUV2) + MAKE_CASE(NVPTXISD::LDUV4) + MAKE_CASE(NVPTXISD::StoreV2) + MAKE_CASE(NVPTXISD::StoreV4) + MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP) + MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP) + MAKE_CASE(NVPTXISD::IMAD) + MAKE_CASE(NVPTXISD::BFE) + MAKE_CASE(NVPTXISD::BFI) + MAKE_CASE(NVPTXISD::PRMT) + MAKE_CASE(NVPTXISD::SETP_F16X2) + MAKE_CASE(NVPTXISD::SETP_BF16X2) + MAKE_CASE(NVPTXISD::Dummy) + MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) + MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) + MAKE_CASE(NVPTXISD::Tex1DFloatS32) + MAKE_CASE(NVPTXISD::Tex1DFloatFloat) + MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex1DS32S32) + MAKE_CASE(NVPTXISD::Tex1DS32Float) + MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex1DU32S32) + MAKE_CASE(NVPTXISD::Tex1DU32Float) + MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex1DArrayS32S32) + MAKE_CASE(NVPTXISD::Tex1DArrayS32Float) + MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad) + 
MAKE_CASE(NVPTXISD::Tex1DArrayU32S32) + MAKE_CASE(NVPTXISD::Tex1DArrayU32Float) + MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DFloatS32) + MAKE_CASE(NVPTXISD::Tex2DFloatFloat) + MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex2DS32S32) + MAKE_CASE(NVPTXISD::Tex2DS32Float) + MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DU32S32) + MAKE_CASE(NVPTXISD::Tex2DU32Float) + MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex2DArrayS32S32) + MAKE_CASE(NVPTXISD::Tex2DArrayS32Float) + MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex2DArrayU32S32) + MAKE_CASE(NVPTXISD::Tex2DArrayU32Float) + MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::Tex3DFloatS32) + MAKE_CASE(NVPTXISD::Tex3DFloatFloat) + MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel) + MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad) + MAKE_CASE(NVPTXISD::Tex3DS32S32) + MAKE_CASE(NVPTXISD::Tex3DS32Float) + MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel) + MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad) + MAKE_CASE(NVPTXISD::Tex3DU32S32) + MAKE_CASE(NVPTXISD::Tex3DU32Float) + MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel) + MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexCubeFloatFloat) + MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexCubeS32Float) + MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel) + MAKE_CASE(NVPTXISD::TexCubeU32Float) + MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel) + MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexCubeArrayS32Float) + MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexCubeArrayU32Float) + MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4R2DS64Float) + MAKE_CASE(NVPTXISD::Tld4G2DS64Float) + MAKE_CASE(NVPTXISD::Tld4B2DS64Float) + MAKE_CASE(NVPTXISD::Tld4A2DS64Float) + MAKE_CASE(NVPTXISD::Tld4R2DU64Float) + MAKE_CASE(NVPTXISD::Tld4G2DU64Float) + MAKE_CASE(NVPTXISD::Tld4B2DU64Float) + MAKE_CASE(NVPTXISD::Tld4A2DU64Float) + + MAKE_CASE(NVPTXISD::TexUnified1DFloatS32) + MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DS32S32) + MAKE_CASE(NVPTXISD::TexUnified1DS32Float) + MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DU32S32) + MAKE_CASE(NVPTXISD::TexUnified1DU32Float) + MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32) + MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float) + 
MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DFloatS32) + MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DS32S32) + MAKE_CASE(NVPTXISD::TexUnified2DS32Float) + MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DU32S32) + MAKE_CASE(NVPTXISD::TexUnified2DU32Float) + MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified3DFloatS32) + MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat) + MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnified3DS32S32) + MAKE_CASE(NVPTXISD::TexUnified3DS32Float) + MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnified3DU32S32) + MAKE_CASE(NVPTXISD::TexUnified3DU32Float) + MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat) + MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel) + MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad) + MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad) + MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat) + MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float) + MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float) + + 
MAKE_CASE(NVPTXISD::Suld1DI8Clamp) + MAKE_CASE(NVPTXISD::Suld1DI16Clamp) + MAKE_CASE(NVPTXISD::Suld1DI32Clamp) + MAKE_CASE(NVPTXISD::Suld1DI64Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld2DI8Clamp) + MAKE_CASE(NVPTXISD::Suld2DI16Clamp) + MAKE_CASE(NVPTXISD::Suld2DI32Clamp) + MAKE_CASE(NVPTXISD::Suld2DI64Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld3DI8Clamp) + MAKE_CASE(NVPTXISD::Suld3DI16Clamp) + MAKE_CASE(NVPTXISD::Suld3DI32Clamp) + MAKE_CASE(NVPTXISD::Suld3DI64Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp) + MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp) + MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp) + MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp) + MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp) + + MAKE_CASE(NVPTXISD::Suld1DI8Trap) + MAKE_CASE(NVPTXISD::Suld1DI16Trap) + MAKE_CASE(NVPTXISD::Suld1DI32Trap) + MAKE_CASE(NVPTXISD::Suld1DI64Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I8Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I16Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I32Trap) + MAKE_CASE(NVPTXISD::Suld1DV2I64Trap) + MAKE_CASE(NVPTXISD::Suld1DV4I8Trap) + MAKE_CASE(NVPTXISD::Suld1DV4I16Trap) + MAKE_CASE(NVPTXISD::Suld1DV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld2DI8Trap) + MAKE_CASE(NVPTXISD::Suld2DI16Trap) + MAKE_CASE(NVPTXISD::Suld2DI32Trap) + MAKE_CASE(NVPTXISD::Suld2DI64Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I8Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I16Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I32Trap) + MAKE_CASE(NVPTXISD::Suld2DV2I64Trap) + MAKE_CASE(NVPTXISD::Suld2DV4I8Trap) + MAKE_CASE(NVPTXISD::Suld2DV4I16Trap) + 
MAKE_CASE(NVPTXISD::Suld2DV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld3DI8Trap) + MAKE_CASE(NVPTXISD::Suld3DI16Trap) + MAKE_CASE(NVPTXISD::Suld3DI32Trap) + MAKE_CASE(NVPTXISD::Suld3DI64Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I8Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I16Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I32Trap) + MAKE_CASE(NVPTXISD::Suld3DV2I64Trap) + MAKE_CASE(NVPTXISD::Suld3DV4I8Trap) + MAKE_CASE(NVPTXISD::Suld3DV4I16Trap) + MAKE_CASE(NVPTXISD::Suld3DV4I32Trap) + + MAKE_CASE(NVPTXISD::Suld1DI8Zero) + MAKE_CASE(NVPTXISD::Suld1DI16Zero) + MAKE_CASE(NVPTXISD::Suld1DI32Zero) + MAKE_CASE(NVPTXISD::Suld1DI64Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I8Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I16Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I32Zero) + MAKE_CASE(NVPTXISD::Suld1DV2I64Zero) + MAKE_CASE(NVPTXISD::Suld1DV4I8Zero) + MAKE_CASE(NVPTXISD::Suld1DV4I16Zero) + MAKE_CASE(NVPTXISD::Suld1DV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero) + MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld2DI8Zero) + MAKE_CASE(NVPTXISD::Suld2DI16Zero) + MAKE_CASE(NVPTXISD::Suld2DI32Zero) + MAKE_CASE(NVPTXISD::Suld2DI64Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I8Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I16Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I32Zero) + MAKE_CASE(NVPTXISD::Suld2DV2I64Zero) + MAKE_CASE(NVPTXISD::Suld2DV4I8Zero) + MAKE_CASE(NVPTXISD::Suld2DV4I16Zero) + MAKE_CASE(NVPTXISD::Suld2DV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero) + MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero) + + MAKE_CASE(NVPTXISD::Suld3DI8Zero) + MAKE_CASE(NVPTXISD::Suld3DI16Zero) + MAKE_CASE(NVPTXISD::Suld3DI32Zero) + MAKE_CASE(NVPTXISD::Suld3DI64Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I8Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I16Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I32Zero) + MAKE_CASE(NVPTXISD::Suld3DV2I64Zero) + MAKE_CASE(NVPTXISD::Suld3DV4I8Zero) + MAKE_CASE(NVPTXISD::Suld3DV4I16Zero) + MAKE_CASE(NVPTXISD::Suld3DV4I32Zero) } return nullptr; + +#undef MAKE_CASE } TargetLoweringBase::LegalizeTypeAction @@ -3070,8 +2879,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // See similar issue in LowerCall. 
unsigned InsIdx = 0; - int idx = 0; - for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { + for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) { Type *Ty = argTypes[i]; if (theArgs[i]->use_empty()) { @@ -3107,10 +2915,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } - // In the following cases, assign a node order of "idx+1" + // In the following cases, assign a node order of "i+1" // to newly created nodes. The SDNodes for params have to // appear in the same order as their order of appearance - // in the original function. "idx+1" holds that order. + // in the original function. "i+1" holds that order. if (!PAL.hasParamAttr(i, Attribute::ByVal)) { bool aggregateIsPacked = false; if (StructType *STy = dyn_cast<StructType>(Ty)) @@ -3125,7 +2933,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + SDValue Arg = getParamSymbol(DAG, i, PtrVT); int VecIdx = -1; // Index of the first element of the current vector. for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { if (VectorInfo[parti] & PVF_FIRST) { @@ -3159,7 +2967,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); if (P.getNode()) - P.getNode()->setIROrder(idx + 1); + P.getNode()->setIROrder(i + 1); for (unsigned j = 0; j < NumElts; ++j) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, DAG.getIntPtrConstant(j, dl)); @@ -3208,10 +3016,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( EVT ObjectVT = getValueType(DL, Ty); assert(ObjectVT == Ins[InsIdx].VT && "Ins type did not match function type"); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + SDValue Arg = getParamSymbol(DAG, i, PtrVT); SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) - p.getNode()->setIROrder(idx + 1); + p.getNode()->setIROrder(i + 1); InVals.push_back(p); } -- cgit v1.1 From 0572dabb71147fdc156d90a3ecd036d1652c2006 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 8 Feb 2024 21:56:57 +0000 Subject: [gn build] Add IntrinsicsSPIRV.h generator This was missing in the gn build for some reason, causing build errors like http://45.33.8.238/linux/130337/step_4.txt after 3b57b647. 
--- llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn index a594d2a..87e5860 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn @@ -85,6 +85,10 @@ gen_arch_intrinsics("IntrinsicsS390") { intrinsic_prefix = "s390" } +gen_arch_intrinsics("IntrinsicsSPIRV") { + intrinsic_prefix = "spv" +} + gen_arch_intrinsics("IntrinsicsVE") { intrinsic_prefix = "ve" } @@ -128,6 +132,7 @@ group("public_tablegen") { ":IntrinsicsR600", ":IntrinsicsRISCV", ":IntrinsicsS390", + ":IntrinsicsSPIRV", ":IntrinsicsVE", ":IntrinsicsWebAssembly", ":IntrinsicsX86", -- cgit v1.1 From a6f42adf9ba03d69e8bf2eaf5af2e7f8f6294b37 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 9 Feb 2024 07:17:16 +0900 Subject: [Bazel] Update for SPIRV --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index f720c18..6b947d4 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -772,6 +772,10 @@ llvm_target_intrinsics_list = [ "intrinsic_prefix": "s390", }, { + "name": "SPIRV", + "intrinsic_prefix": "spv", + }, + { "name": "VE", "intrinsic_prefix": "ve", }, -- cgit v1.1 From 7fe97f042036407a124bf1646a3f1124ddac3de5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 8 Feb 2024 14:21:49 -0800 Subject: [BOLT] Always run CheckLargeFunctions in non-relocation mode (#80922) We run CheckLargeFunctions pass in non-relocation mode to prevent the emission of functions that later could not be written to the output due to their large size. The main reason behind the pass is to prevent the emission of metadata for such functions since this metadata becomes incorrect if the function is left unmodified. Currently, the pass is enabled in non-relocation mode only when debug info output is also enabled. As we emit increasingly more kinds of metadata, e.g. for the Linux kernel, it becomes more challenging to track metadata that needs to be fixed. Hence, I'm enabling the pass to always run in non-relocation mode. --- bolt/lib/Passes/BinaryPasses.cpp | 5 +---- bolt/lib/Rewrite/RewriteInstance.cpp | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index bcb1227..08dce2f 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -554,11 +554,8 @@ void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { if (BC.HasRelocations) return; - if (!opts::UpdateDebugSections) - return; - // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit - // incorrect debug info. + // incorrect meta data. 
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { uint64_t HotSize, ColdSize; std::tie(HotSize, ColdSize) = diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 9a242d9..c909e31 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -3631,6 +3631,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) { Function.setImageAddress(FuncSection->getAllocAddress()); Function.setImageSize(FuncSection->getOutputSize()); if (Function.getImageSize() > Function.getMaxSize()) { + assert(!BC->isX86() && "Unexpected large function."); TooLarge = true; FailedAddresses.emplace_back(Function.getAddress()); } @@ -5367,6 +5368,7 @@ void RewriteInstance::rewriteFile() { continue; if (Function->getImageSize() > Function->getMaxSize()) { + assert(!BC->isX86() && "Unexpected large function."); if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: new function size (0x" << Twine::utohexstr(Function->getImageSize()) -- cgit v1.1 From 7b5a9bb8f0f58b188655252f74b0941512e44389 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Thu, 8 Feb 2024 14:22:24 -0800 Subject: [github/CODEOWNERS] Add Alexander as BOLT reviewer. --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 767f58e..3fe0cbb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -103,4 +103,4 @@ /mlir/**/*SparseTensor*/ @aartbik @PeimingLiu @yinying-lisa-li @matthias-springer # BOLT -/bolt/ @aaupov @maksfb @rafaelauler @dcci +/bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci -- cgit v1.1 From 3c42e10afdc518f6d8be5620289ef0da0bf03c5f Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Thu, 8 Feb 2024 14:27:14 -0800 Subject: Consider aggregate bases when checking if an InitListExpr is constant (#80519) This code was correct as written prior to C++17, which allowed bases to appear in the initializer list. This was observable by creating non-constant aggregate initialization at file scope in a compound literal, but since that behavior will change soon if we implement support for dynamic initialization, I also added a unit test for `isConstantInitializer`. This fixes at least one part of issue #80510. --------- Co-authored-by: Aaron Ballman --- clang/docs/ReleaseNotes.rst | 4 ++ clang/lib/AST/Expr.cpp | 19 +++++++++ clang/test/SemaCXX/compound-literal.cpp | 21 ++++++++++ clang/unittests/AST/ASTExprTest.cpp | 68 +++++++++++++++++++++++++++++---- 4 files changed, 104 insertions(+), 8 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 32440ee..df3ad20 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -217,6 +217,10 @@ Bug Fixes to C++ Support Fixes (`#80971 ICE when explicit object parameter be a function parameter pack`) - Fixed a bug where abbreviated function templates would append their invented template parameters to an empty template parameter lists. +- Clang now classifies aggregate initialization in C++17 and newer as constant + or non-constant more accurately. Previously, only a subset of the initializer + elements were considered, misclassifying some initializers as constant. Fixes + some of (`#80510 <https://github.com/llvm/llvm-project/issues/80510>`_). 
Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index d665a08..8b10e28 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -3328,6 +3328,12 @@ bool Expr::isConstantInitializer(ASTContext &Ctx, bool IsForRef, DIUE->getUpdater()->isConstantInitializer(Ctx, false, Culprit); } case InitListExprClass: { + // C++ [dcl.init.aggr]p2: + // The elements of an aggregate are: + // - for an array, the array elements in increasing subscript order, or + // - for a class, the direct base classes in declaration order, followed + // by the direct non-static data members (11.4) that are not members of + // an anonymous union, in declaration order. const InitListExpr *ILE = cast<InitListExpr>(this); assert(ILE->isSemanticForm() && "InitListExpr must be in semantic form"); if (ILE->getType()->isArrayType()) { @@ -3342,6 +3348,19 @@ bool Expr::isConstantInitializer(ASTContext &Ctx, bool IsForRef, if (ILE->getType()->isRecordType()) { unsigned ElementNo = 0; RecordDecl *RD = ILE->getType()->castAs<RecordType>()->getDecl(); + + // In C++17, bases were added to the list of members used by aggregate + // initialization. + if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) { + for (unsigned i = 0, e = CXXRD->getNumBases(); i < e; i++) { + if (ElementNo < ILE->getNumInits()) { + const Expr *Elt = ILE->getInit(ElementNo++); + if (!Elt->isConstantInitializer(Ctx, false, Culprit)) + return false; + } + } + } + for (const auto *Field : RD->fields()) { // If this is a union, skip all the fields that aren't being initialized. if (RD->isUnion() && ILE->getInitializedFieldInUnion() != Field) diff --git a/clang/test/SemaCXX/compound-literal.cpp b/clang/test/SemaCXX/compound-literal.cpp index 5957099..a3d3b9f 100644 --- a/clang/test/SemaCXX/compound-literal.cpp +++ b/clang/test/SemaCXX/compound-literal.cpp @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify -ast-dump %s > %t-11 // RUN: FileCheck --input-file=%t-11 %s // RUN: FileCheck --input-file=%t-11 %s --check-prefix=CHECK-CXX11 +// RUN: %clang_cc1 -verify -std=c++17 %s // http://llvm.org/PR7905 namespace PR7905 { @@ -108,3 +109,23 @@ int computed_with_lambda = [] { return result; }(); #endif + +namespace DynamicFileScopeLiteral { +// This covers the case where we have a file-scope compound literal with a +// non-constant initializer in C++. Previously, we had a bug where Clang forgot +// to consider initializer list elements for bases.
+struct Empty {}; +struct Foo : Empty { // expected-note 0+ {{candidate constructor}} + int x; + int y; +}; +int f(); +#if __cplusplus < 201103L +// expected-error@+6 {{non-aggregate type 'Foo' cannot be initialized with an initializer list}} +#elif __cplusplus < 201703L +// expected-error@+4 {{no matching constructor}} +#else +// expected-error@+2 {{initializer element is not a compile-time constant}} +#endif +Foo o = (Foo){ {}, 1, f() }; +} diff --git a/clang/unittests/AST/ASTExprTest.cpp b/clang/unittests/AST/ASTExprTest.cpp index ec75492..5ec6aea 100644 --- a/clang/unittests/AST/ASTExprTest.cpp +++ b/clang/unittests/AST/ASTExprTest.cpp @@ -20,17 +20,37 @@ using namespace clang; +using clang::ast_matchers::cxxRecordDecl; +using clang::ast_matchers::hasName; +using clang::ast_matchers::match; +using clang::ast_matchers::varDecl; +using clang::tooling::buildASTFromCode; + +static IntegerLiteral *createIntLiteral(ASTContext &Ctx, uint32_t Value) { + const int numBits = 32; + return IntegerLiteral::Create(Ctx, llvm::APInt(numBits, Value), Ctx.IntTy, + {}); +} + +const CXXRecordDecl *getCXXRecordDeclNode(ASTUnit *AST, + const std::string &Name) { + auto Result = + match(cxxRecordDecl(hasName(Name)).bind("record"), AST->getASTContext()); + EXPECT_FALSE(Result.empty()); + return Result[0].getNodeAs<CXXRecordDecl>("record"); +} + +const VarDecl *getVariableNode(ASTUnit *AST, const std::string &Name) { + auto Result = match(varDecl(hasName(Name)).bind("var"), AST->getASTContext()); + EXPECT_EQ(Result.size(), 1u); + return Result[0].getNodeAs<VarDecl>("var"); +} + TEST(ASTExpr, IgnoreExprCallbackForwarded) { constexpr char Code[] = ""; auto AST = tooling::buildASTFromCodeWithArgs(Code, /*Args=*/{"-std=c++20"}); ASTContext &Ctx = AST->getASTContext(); - auto createIntLiteral = [&](uint32_t Value) -> IntegerLiteral * { - const int numBits = 32; - return IntegerLiteral::Create(Ctx, llvm::APInt(numBits, Value), - Ctx.UnsignedIntTy, {}); - }; - struct IgnoreParens { Expr *operator()(Expr *E) & { return nullptr; } Expr *operator()(Expr *E) && { @@ -42,7 +62,7 @@ TEST(ASTExpr, IgnoreExprCallbackForwarded) { { - auto *IntExpr = createIntLiteral(10); + auto *IntExpr = createIntLiteral(Ctx, 10); ParenExpr *PE = new (Ctx) ParenExpr(SourceLocation{}, SourceLocation{}, IntExpr); EXPECT_EQ(IntExpr, IgnoreExprNodes(PE, IgnoreParens{})); @@ -50,9 +70,41 @@ TEST(ASTExpr, IgnoreExprCallbackForwarded) { { IgnoreParens CB{}; - auto *IntExpr = createIntLiteral(10); + auto *IntExpr = createIntLiteral(Ctx, 10); ParenExpr *PE = new (Ctx) ParenExpr(SourceLocation{}, SourceLocation{}, IntExpr); EXPECT_EQ(nullptr, IgnoreExprNodes(PE, CB)); } } + +TEST(ASTExpr, InitListIsConstantInitialized) { + auto AST = buildASTFromCode(R"cpp( + struct Empty {}; + struct Foo : Empty { int x, y; }; + int gv; + )cpp"); + ASTContext &Ctx = AST->getASTContext(); + const CXXRecordDecl *Empty = getCXXRecordDeclNode(AST.get(), "Empty"); + const CXXRecordDecl *Foo = getCXXRecordDeclNode(AST.get(), "Foo"); + + SourceLocation Loc{}; + InitListExpr *BaseInit = new (Ctx) InitListExpr(Ctx, Loc, {}, Loc); + BaseInit->setType(Ctx.getRecordType(Empty)); + Expr *Exprs[3] = { + BaseInit, + createIntLiteral(Ctx, 13), + createIntLiteral(Ctx, 42), + }; + InitListExpr *FooInit = new (Ctx) InitListExpr(Ctx, Loc, Exprs, Loc); + FooInit->setType(Ctx.getRecordType(Foo)); + EXPECT_TRUE(FooInit->isConstantInitializer(Ctx, false)); + + // Replace the last initializer with something non-constant and make sure + // this returns false.
Previously we had a bug where we didn't count base + // initializers, and only iterated over fields. + const VarDecl *GV = getVariableNode(AST.get(), "gv"); + auto *Ref = new (Ctx) DeclRefExpr(Ctx, const_cast<VarDecl *>(GV), false, + Ctx.IntTy, VK_LValue, Loc); + (void)FooInit->updateInit(Ctx, 2, Ref); + EXPECT_FALSE(FooInit->isConstantInitializer(Ctx, false)); +} -- cgit v1.1 From 06c89bd59ca2279f76a41e851b7b2df634a6191e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 9 Feb 2024 06:51:11 +0800 Subject: [RISCV] Check type is legal before combining mgather to vlse intrinsic (#81107) Otherwise we will crash since target intrinsics don't have their types legalized. Let the mgather get legalized first, then do the combine on the legal type. Fixes #81088 Co-authored-by: Craig Topper --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 448 +++++++++++++++++++++ 2 files changed, 449 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a62610b..12c0cd5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -15833,7 +15833,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, MGN->getMemOperand(), IndexType, MGN->getExtensionType()); if (Index.getOpcode() == ISD::BUILD_VECTOR && - MGN->getExtensionType() == ISD::NON_EXTLOAD) { + MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) { if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index); SimpleVID && SimpleVID->StepDenominator == 1) { const int64_t StepNumerator = SimpleVID->StepNumerator; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index df41ac1..890707c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -14638,5 +14638,453 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %allones, <8 x i16> poison) ret <8 x i16> %v } + +; v32i64 is not a legal type, so make sure we don't try to combine the mgather +; to a vlse intrinsic until it is legalized and split.
+define <32 x i64> @mgather_strided_split(ptr %base) { +; RV32V-LABEL: mgather_strided_split: +; RV32V: # %bb.0: +; RV32V-NEXT: li a1, 16 +; RV32V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32V-NEXT: vlse64.v v8, (a0), a1 +; RV32V-NEXT: addi a0, a0, 256 +; RV32V-NEXT: vlse64.v v16, (a0), a1 +; RV32V-NEXT: ret +; +; RV64V-LABEL: mgather_strided_split: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 16 +; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64V-NEXT: vlse64.v v8, (a0), a1 +; RV64V-NEXT: addi a0, a0, 256 +; RV64V-NEXT: vlse64.v v16, (a0), a1 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_strided_split: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -512 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 512 +; RV32ZVE32F-NEXT: sw ra, 508(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 504(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 500(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 496(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 492(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 488(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 484(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 480(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 476(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s9, 472(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s10, 468(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s11, 464(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: .cfi_offset s3, -16 +; RV32ZVE32F-NEXT: .cfi_offset s4, -20 +; RV32ZVE32F-NEXT: .cfi_offset s5, -24 +; RV32ZVE32F-NEXT: .cfi_offset s6, -28 +; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; RV32ZVE32F-NEXT: .cfi_offset s9, -40 +; RV32ZVE32F-NEXT: .cfi_offset s10, -44 +; RV32ZVE32F-NEXT: .cfi_offset s11, -48 +; RV32ZVE32F-NEXT: addi s0, sp, 512 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, sp, -128 +; RV32ZVE32F-NEXT: li a2, 32 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vid.v v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 216(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 208(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 248(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 244(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 236(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 228(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 220(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 240(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 232(sp) # 4-byte 
Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 224(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 212(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 204(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 200(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 196(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 192(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi a1, sp, 256 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vse32.v v8, (a1) +; RV32ZVE32F-NEXT: lw a1, 288(sp) +; RV32ZVE32F-NEXT: lw a2, 292(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 188(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 184(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 296(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 180(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 176(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 300(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 172(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 168(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 304(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 164(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 160(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 308(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 156(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 152(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 312(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 148(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 144(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 316(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 140(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 136(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 320(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 132(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 128(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 324(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 124(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 120(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 328(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 332(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 104(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw ra, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 336(sp) +; RV32ZVE32F-NEXT: lw s10, 0(a2) +; RV32ZVE32F-NEXT: lw s8, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 340(sp) +; RV32ZVE32F-NEXT: lw s6, 0(a1) +; RV32ZVE32F-NEXT: lw s4, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 344(sp) +; RV32ZVE32F-NEXT: lw s2, 0(a2) +; 
RV32ZVE32F-NEXT: lw t5, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 348(sp) +; RV32ZVE32F-NEXT: lw t3, 0(a4) +; RV32ZVE32F-NEXT: lw t2, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 352(sp) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a7, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 356(sp) +; RV32ZVE32F-NEXT: lw a6, 0(a4) +; RV32ZVE32F-NEXT: lw a5, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 360(sp) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: sw a1, 108(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: sw a1, 100(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 364(sp) +; RV32ZVE32F-NEXT: lw s11, 0(a4) +; RV32ZVE32F-NEXT: lw s9, 4(a4) +; RV32ZVE32F-NEXT: lw a1, 368(sp) +; RV32ZVE32F-NEXT: lw s7, 0(a2) +; RV32ZVE32F-NEXT: lw s5, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 372(sp) +; RV32ZVE32F-NEXT: lw s3, 0(a1) +; RV32ZVE32F-NEXT: lw t6, 4(a1) +; RV32ZVE32F-NEXT: lw a2, 376(sp) +; RV32ZVE32F-NEXT: lw t4, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 380(sp) +; RV32ZVE32F-NEXT: lw t1, 4(a3) +; RV32ZVE32F-NEXT: lw a4, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a5, 196(a0) +; RV32ZVE32F-NEXT: sw a6, 192(a0) +; RV32ZVE32F-NEXT: sw a7, 188(a0) +; RV32ZVE32F-NEXT: sw t0, 184(a0) +; RV32ZVE32F-NEXT: sw t2, 180(a0) +; RV32ZVE32F-NEXT: sw t3, 176(a0) +; RV32ZVE32F-NEXT: sw t5, 172(a0) +; RV32ZVE32F-NEXT: sw s2, 168(a0) +; RV32ZVE32F-NEXT: sw s4, 164(a0) +; RV32ZVE32F-NEXT: sw s6, 160(a0) +; RV32ZVE32F-NEXT: sw s8, 156(a0) +; RV32ZVE32F-NEXT: sw s10, 152(a0) +; RV32ZVE32F-NEXT: sw ra, 148(a0) +; RV32ZVE32F-NEXT: lw a5, 104(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 144(a0) +; RV32ZVE32F-NEXT: lw a5, 112(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 140(a0) +; RV32ZVE32F-NEXT: lw a5, 116(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 136(a0) +; RV32ZVE32F-NEXT: lw a5, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 132(a0) +; RV32ZVE32F-NEXT: lw a5, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 128(a0) +; RV32ZVE32F-NEXT: lw a5, 128(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 124(a0) +; RV32ZVE32F-NEXT: lw a5, 132(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 120(a0) +; RV32ZVE32F-NEXT: lw a5, 136(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 116(a0) +; RV32ZVE32F-NEXT: lw a5, 140(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 112(a0) +; RV32ZVE32F-NEXT: lw a5, 144(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 108(a0) +; RV32ZVE32F-NEXT: lw a5, 148(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 104(a0) +; RV32ZVE32F-NEXT: lw a5, 152(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 100(a0) +; RV32ZVE32F-NEXT: lw a5, 156(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 96(a0) +; RV32ZVE32F-NEXT: lw a5, 160(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 92(a0) +; RV32ZVE32F-NEXT: lw a5, 164(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 88(a0) +; RV32ZVE32F-NEXT: lw a5, 168(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 84(a0) +; RV32ZVE32F-NEXT: lw a5, 172(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 80(a0) +; RV32ZVE32F-NEXT: lw a5, 176(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 76(a0) +; RV32ZVE32F-NEXT: lw a5, 180(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 72(a0) +; RV32ZVE32F-NEXT: lw a5, 184(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 68(a0) +; RV32ZVE32F-NEXT: lw a5, 188(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 64(a0) +; RV32ZVE32F-NEXT: lw a5, 208(sp) 
# 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 4(a0) +; RV32ZVE32F-NEXT: lw a5, 216(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 252(a0) +; RV32ZVE32F-NEXT: sw a2, 248(a0) +; RV32ZVE32F-NEXT: sw a3, 244(a0) +; RV32ZVE32F-NEXT: sw a4, 240(a0) +; RV32ZVE32F-NEXT: sw t1, 236(a0) +; RV32ZVE32F-NEXT: sw t4, 232(a0) +; RV32ZVE32F-NEXT: sw t6, 228(a0) +; RV32ZVE32F-NEXT: sw s3, 224(a0) +; RV32ZVE32F-NEXT: sw s5, 220(a0) +; RV32ZVE32F-NEXT: sw s7, 216(a0) +; RV32ZVE32F-NEXT: sw s9, 212(a0) +; RV32ZVE32F-NEXT: sw s11, 208(a0) +; RV32ZVE32F-NEXT: lw a1, 100(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 204(a0) +; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 200(a0) +; RV32ZVE32F-NEXT: lw a1, 220(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 28(a0) +; RV32ZVE32F-NEXT: lw a1, 228(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 24(a0) +; RV32ZVE32F-NEXT: lw a1, 236(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 20(a0) +; RV32ZVE32F-NEXT: lw a1, 244(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 16(a0) +; RV32ZVE32F-NEXT: lw a1, 248(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 12(a0) +; RV32ZVE32F-NEXT: lw a1, 252(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; RV32ZVE32F-NEXT: lw a1, 192(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 60(a0) +; RV32ZVE32F-NEXT: lw a1, 196(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 56(a0) +; RV32ZVE32F-NEXT: lw a1, 200(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 52(a0) +; RV32ZVE32F-NEXT: lw a1, 204(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 48(a0) +; RV32ZVE32F-NEXT: lw a1, 212(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 44(a0) +; RV32ZVE32F-NEXT: lw a1, 224(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 40(a0) +; RV32ZVE32F-NEXT: lw a1, 232(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 36(a0) +; RV32ZVE32F-NEXT: lw a1, 240(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 32(a0) +; RV32ZVE32F-NEXT: addi sp, s0, -512 +; RV32ZVE32F-NEXT: lw ra, 508(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 504(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 500(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 496(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 492(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 488(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 484(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 480(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 476(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 472(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s10, 468(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s11, 464(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 512 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_strided_split: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -144 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 144 +; RV64ZVE32F-NEXT: sd ra, 136(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s0, 128(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s1, 120(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s2, 112(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s3, 104(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s4, 96(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s5, 88(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s6, 80(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s7, 72(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s8, 64(sp) # 8-byte Folded 
Spill +; RV64ZVE32F-NEXT: sd s9, 56(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s10, 48(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s11, 40(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: .cfi_offset ra, -8 +; RV64ZVE32F-NEXT: .cfi_offset s0, -16 +; RV64ZVE32F-NEXT: .cfi_offset s1, -24 +; RV64ZVE32F-NEXT: .cfi_offset s2, -32 +; RV64ZVE32F-NEXT: .cfi_offset s3, -40 +; RV64ZVE32F-NEXT: .cfi_offset s4, -48 +; RV64ZVE32F-NEXT: .cfi_offset s5, -56 +; RV64ZVE32F-NEXT: .cfi_offset s6, -64 +; RV64ZVE32F-NEXT: .cfi_offset s7, -72 +; RV64ZVE32F-NEXT: .cfi_offset s8, -80 +; RV64ZVE32F-NEXT: .cfi_offset s9, -88 +; RV64ZVE32F-NEXT: .cfi_offset s10, -96 +; RV64ZVE32F-NEXT: .cfi_offset s11, -104 +; RV64ZVE32F-NEXT: ld a2, 0(a1) +; RV64ZVE32F-NEXT: sd a2, 32(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 16(a1) +; RV64ZVE32F-NEXT: sd a2, 24(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 32(a1) +; RV64ZVE32F-NEXT: sd a2, 16(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 48(a1) +; RV64ZVE32F-NEXT: sd a2, 8(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 64(a1) +; RV64ZVE32F-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a7, 80(a1) +; RV64ZVE32F-NEXT: ld t0, 96(a1) +; RV64ZVE32F-NEXT: ld t1, 112(a1) +; RV64ZVE32F-NEXT: ld t2, 128(a1) +; RV64ZVE32F-NEXT: ld t3, 144(a1) +; RV64ZVE32F-NEXT: ld t4, 160(a1) +; RV64ZVE32F-NEXT: ld t5, 176(a1) +; RV64ZVE32F-NEXT: ld t6, 192(a1) +; RV64ZVE32F-NEXT: ld s0, 208(a1) +; RV64ZVE32F-NEXT: ld s1, 224(a1) +; RV64ZVE32F-NEXT: ld s2, 240(a1) +; RV64ZVE32F-NEXT: ld s3, 256(a1) +; RV64ZVE32F-NEXT: ld s4, 272(a1) +; RV64ZVE32F-NEXT: ld s5, 288(a1) +; RV64ZVE32F-NEXT: ld s6, 304(a1) +; RV64ZVE32F-NEXT: ld s7, 320(a1) +; RV64ZVE32F-NEXT: ld s8, 336(a1) +; RV64ZVE32F-NEXT: ld s9, 352(a1) +; RV64ZVE32F-NEXT: ld s10, 368(a1) +; RV64ZVE32F-NEXT: ld s11, 384(a1) +; RV64ZVE32F-NEXT: ld ra, 400(a1) +; RV64ZVE32F-NEXT: ld a6, 416(a1) +; RV64ZVE32F-NEXT: ld a5, 432(a1) +; RV64ZVE32F-NEXT: ld a2, 496(a1) +; RV64ZVE32F-NEXT: ld a3, 480(a1) +; RV64ZVE32F-NEXT: ld a4, 464(a1) +; RV64ZVE32F-NEXT: ld a1, 448(a1) +; RV64ZVE32F-NEXT: sd a2, 248(a0) +; RV64ZVE32F-NEXT: sd a3, 240(a0) +; RV64ZVE32F-NEXT: sd a4, 232(a0) +; RV64ZVE32F-NEXT: sd a1, 224(a0) +; RV64ZVE32F-NEXT: sd a5, 216(a0) +; RV64ZVE32F-NEXT: sd a6, 208(a0) +; RV64ZVE32F-NEXT: sd ra, 200(a0) +; RV64ZVE32F-NEXT: sd s11, 192(a0) +; RV64ZVE32F-NEXT: sd s10, 184(a0) +; RV64ZVE32F-NEXT: sd s9, 176(a0) +; RV64ZVE32F-NEXT: sd s8, 168(a0) +; RV64ZVE32F-NEXT: sd s7, 160(a0) +; RV64ZVE32F-NEXT: sd s6, 152(a0) +; RV64ZVE32F-NEXT: sd s5, 144(a0) +; RV64ZVE32F-NEXT: sd s4, 136(a0) +; RV64ZVE32F-NEXT: sd s3, 128(a0) +; RV64ZVE32F-NEXT: sd s2, 120(a0) +; RV64ZVE32F-NEXT: sd s1, 112(a0) +; RV64ZVE32F-NEXT: sd s0, 104(a0) +; RV64ZVE32F-NEXT: sd t6, 96(a0) +; RV64ZVE32F-NEXT: sd t5, 88(a0) +; RV64ZVE32F-NEXT: sd t4, 80(a0) +; RV64ZVE32F-NEXT: sd t3, 72(a0) +; RV64ZVE32F-NEXT: sd t2, 64(a0) +; RV64ZVE32F-NEXT: sd t1, 56(a0) +; RV64ZVE32F-NEXT: sd t0, 48(a0) +; RV64ZVE32F-NEXT: sd a7, 40(a0) +; RV64ZVE32F-NEXT: ld a1, 0(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 32(a0) +; RV64ZVE32F-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 24(a0) +; RV64ZVE32F-NEXT: ld a1, 16(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 8(a0) +; RV64ZVE32F-NEXT: ld a1, 32(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 0(a0) +; RV64ZVE32F-NEXT: ld ra, 136(sp) # 8-byte Folded Reload +; 
RV64ZVE32F-NEXT: ld s0, 128(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s1, 120(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s2, 112(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s3, 104(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s4, 96(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s5, 88(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s6, 80(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s7, 72(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s8, 64(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s9, 56(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s10, 48(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s11, 40(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: addi sp, sp, 144 +; RV64ZVE32F-NEXT: ret + %ptrs = getelementptr inbounds i64, ptr %base, <32 x i64> <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38, i64 40, i64 42, i64 44, i64 46, i64 48, i64 50, i64 52, i64 54, i64 56, i64 58, i64 60, i64 62> + %x = call <32 x i64> @llvm.masked.gather.v32i64.v32p0(<32 x ptr> %ptrs, i32 8, <32 x i1> shufflevector(<32 x i1> insertelement(<32 x i1> poison, i1 true, i32 0), <32 x i1> poison, <32 x i32> zeroinitializer), <32 x i64> poison) + ret <32 x i64> %x +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV64: {{.*}} -- cgit v1.1 From f7201505a6ec7a0f904d2f09cece5c770058a991 Mon Sep 17 00:00:00 2001 From: Jerry Wu Date: Thu, 8 Feb 2024 14:52:09 -0800 Subject: [mlir] Add transformation to wrap scf::while in zero-trip-check (#81050) Add `scf::wrapWhileLoopInZeroTripCheck` to wrap scf while loop in zero-trip-check. --- .../mlir/Dialect/SCF/Transforms/Transforms.h | 41 +++++++ mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt | 1 + .../Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp | 132 +++++++++++++++++++++ .../SCF/wrap-while-loop-in-zero-trip-check.mlir | 130 ++++++++++++++++++++ mlir/test/lib/Dialect/SCF/CMakeLists.txt | 1 + .../lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp | 72 +++++++++++ mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 7 files changed, 379 insertions(+) create mode 100644 mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp create mode 100644 mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir create mode 100644 mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h index e91f9e4..690cd14 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h @@ -30,6 +30,7 @@ namespace scf { class IfOp; class ForOp; class ParallelOp; +class WhileOp; /// Fuses all adjacent scf.parallel operations with identical bounds and step /// into one scf.parallel operation. Uses a naive aliasing and dependency /// analysis. @@ -181,6 +182,46 @@ FailureOr<ForOp> pipelineForLoop(RewriterBase &rewriter, ForOp forOp, const PipeliningOption &options, bool *modifiedIR = nullptr); +/// Create zero-trip-check around a `while` op and return the new loop op in the check. The while loop is rotated to avoid evaluating the condition twice. +/// +/// By default the check won't be created for a do-while loop as it is not +/// required. `forceCreateCheck` can force the creation.
+/// +/// It turns: +/// +/// scf.while (%arg0 = %init) : (i32) -> i64 { +/// %val = .., %arg0 : i64 +/// %cond = arith.cmpi .., %arg0 : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg1: i64): +/// %next = .., %arg1 : i32 +/// scf.yield %next : i32 +/// } +/// +/// into: +/// +/// %pre_val = .., %init : i64 +/// %pre_cond = arith.cmpi .., %init : i32 +/// scf.if %pre_cond -> i64 { +/// %res = scf.while (%arg1 = %pre_val) : (i64) -> i64 { +/// %next = .., %arg1 : i32 +/// %val = .., %next : i64 +/// %cond = arith.cmpi .., %next : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg2: i64): +/// scf.yield %arg2 : i32 +/// } +/// scf.yield %res : i64 +/// } else { +/// scf.yield %pre_val : i64 +/// } +FailureOr<WhileOp> wrapWhileLoopInZeroTripCheck(WhileOp whileOp, + RewriterBase &rewriter, + bool forceCreateCheck = false); + } // namespace scf } // namespace mlir diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt index fdaeb2f..e549420 100644 --- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRSCFTransforms ParallelLoopTiling.cpp StructuralTypeConversions.cpp TileUsingInterface.cpp + WrapInZeroTripCheck.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SCF diff --git a/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp b/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp new file mode 100644 index 0000000..f829208 --- /dev/null +++ b/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp @@ -0,0 +1,132 @@ +//===- WrapInZeroTripCheck.cpp - Loop transforms to add zero-trip-check ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" + +using namespace mlir; + +/// Create zero-trip-check around a `while` op and return the new loop op in the check. The while loop is rotated to avoid evaluating the condition twice.
+/// +/// Given an example below: +/// +/// scf.while (%arg0 = %init) : (i32) -> i64 { +/// %val = .., %arg0 : i64 +/// %cond = arith.cmpi .., %arg0 : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg1: i64): +/// %next = .., %arg1 : i32 +/// scf.yield %next : i32 +/// } +/// +/// First, clone the before block to the front of the loop: +/// +/// %pre_val = .., %init : i64 +/// %pre_cond = arith.cmpi .., %init : i32 +/// scf.while (%arg0 = %init) : (i32) -> i64 { +/// %val = .., %arg0 : i64 +/// %cond = arith.cmpi .., %arg0 : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg1: i64): +/// %next = .., %arg1 : i32 +/// scf.yield %next : i32 +/// } +/// +/// Create `if` op with the condition, rotate and move the loop into the else +/// branch: +/// +/// %pre_val = .., %init : i64 +/// %pre_cond = arith.cmpi .., %init : i32 +/// scf.if %pre_cond -> i64 { +/// %res = scf.while (%arg1 = %pre_val) : (i64) -> i64 { +/// // Original after block +/// %next = .., %arg1 : i32 +/// // Original before block +/// %val = .., %next : i64 +/// %cond = arith.cmpi .., %next : i32 +/// scf.condition(%cond) %val : i64 +/// } do { +/// ^bb0(%arg2: i64): +/// scf.yield %arg2 : i32 +/// } +/// scf.yield %res : i64 +/// } else { +/// scf.yield %pre_val : i64 +/// } +FailureOr<scf::WhileOp> mlir::scf::wrapWhileLoopInZeroTripCheck( scf::WhileOp whileOp, RewriterBase &rewriter, bool forceCreateCheck) { + // If the loop is in do-while form (after block only passes through values), + // there is no need to create a zero-trip-check as the before block is always run. + if (!forceCreateCheck && isa<scf::YieldOp>(whileOp.getAfterBody()->front())) { return whileOp; } + + OpBuilder::InsertionGuard insertion_guard(rewriter); + + IRMapping mapper; + Block *beforeBlock = whileOp.getBeforeBody(); + // Clone before block before the loop for zero-trip-check. + for (auto [arg, init] : + llvm::zip_equal(beforeBlock->getArguments(), whileOp.getInits())) { + mapper.map(arg, init); + } + rewriter.setInsertionPoint(whileOp); + for (auto &op : *beforeBlock) { + if (isa<scf::ConditionOp>(op)) { + break; + } + // Safe to clone everything as in a single block all defs have been cloned + // and added to mapper in order. + rewriter.insert(op.clone(mapper)); + } + + scf::ConditionOp condOp = whileOp.getConditionOp(); + Value clonedCondition = mapper.lookupOrDefault(condOp.getCondition()); + SmallVector<Value> clonedCondArgs = llvm::map_to_vector( condOp.getArgs(), [&](Value arg) { return mapper.lookupOrDefault(arg); }); + + // Create rotated while loop. + auto newLoopOp = rewriter.create<scf::WhileOp>( whileOp.getLoc(), whileOp.getResultTypes(), clonedCondArgs, [&](OpBuilder &builder, Location loc, ValueRange args) { + // Rotate and move the loop body into before block. + auto newBlock = builder.getBlock(); + rewriter.mergeBlocks(whileOp.getAfterBody(), newBlock, args); + auto yieldOp = cast<scf::YieldOp>(newBlock->getTerminator()); + rewriter.mergeBlocks(whileOp.getBeforeBody(), newBlock, + yieldOp.getResults()); + rewriter.eraseOp(yieldOp); + }, + [&](OpBuilder &builder, Location loc, ValueRange args) { + // Pass through values. + builder.create<scf::YieldOp>(loc, args); + }); + + // Create zero-trip-check and move the while loop in. + auto ifOp = rewriter.create<scf::IfOp>( whileOp.getLoc(), clonedCondition, [&](OpBuilder &builder, Location loc) { + // Then runs the while loop.
+        rewriter.moveOpBefore(newLoopOp, builder.getInsertionBlock(), + builder.getInsertionPoint()); + builder.create<scf::YieldOp>(loc, newLoopOp.getResults()); + }, + [&](OpBuilder &builder, Location loc) { + // Else returns the results from precondition. + builder.create<scf::YieldOp>(loc, clonedCondArgs); + }); + + rewriter.replaceOp(whileOp, ifOp); + + return newLoopOp; +} diff --git a/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir new file mode 100644 index 0000000..8954839 --- /dev/null +++ b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir @@ -0,0 +1,130 @@ +// RUN: mlir-opt %s -test-wrap-scf-while-loop-in-zero-trip-check -split-input-file | FileCheck %s +// RUN: mlir-opt %s -test-wrap-scf-while-loop-in-zero-trip-check='force-create-check=true' -split-input-file | FileCheck %s --check-prefix FORCE-CREATE-CHECK + +func.func @wrap_while_loop_in_zero_trip_check(%bound : i32) -> i32 { + %cst0 = arith.constant 0 : i32 + %cst5 = arith.constant 5 : i32 + %res:2 = scf.while (%iter = %cst0) : (i32) -> (i32, i32) { + %cond = arith.cmpi slt, %iter, %bound : i32 + %inv = arith.addi %bound, %cst5 : i32 + scf.condition(%cond) %iter, %inv : i32, i32 + } do { + ^bb0(%arg1: i32, %arg2: i32): + %next = arith.addi %arg1, %arg2 : i32 + scf.yield %next : i32 + } + return %res#0 : i32 +} + +// CHECK-LABEL: func.func @wrap_while_loop_in_zero_trip_check( +// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// CHECK-DAG: %[[PRE_COND:.*]] = arith.cmpi slt, %[[C0]], %[[BOUND]] : i32 +// CHECK-DAG: %[[PRE_INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32 +// CHECK: %[[IF:.*]]:2 = scf.if %[[PRE_COND]] -> (i32, i32) { +// CHECK: %[[WHILE:.*]]:2 = scf.while ( +// CHECK-SAME: %[[ARG1:.*]] = %[[C0]], %[[ARG2:.*]] = %[[PRE_INV]] +// CHECK-SAME: ) : (i32, i32) -> (i32, i32) { +// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[ARG2]] : i32 +// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// CHECK: %[[INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32 +// CHECK: scf.condition(%[[COND]]) %[[NEXT]], %[[INV]] : i32, i32 +// CHECK: } do { +// CHECK: ^bb0(%[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32): +// CHECK: scf.yield %[[ARG3]], %[[ARG4]] : i32, i32 +// CHECK: } +// CHECK: scf.yield %[[WHILE]]#0, %[[WHILE]]#1 : i32, i32 +// CHECK: } else { +// CHECK: scf.yield %[[C0]], %[[PRE_INV]] : i32, i32 +// CHECK: } +// CHECK: return %[[IF]]#0 : i32 + +// ----- + +func.func @wrap_while_loop_with_minimal_before_block(%bound : i32) -> i32 { + %cst0 = arith.constant 0 : i32 + %true = arith.constant true + %cst5 = arith.constant 5 : i32 + %res = scf.while (%iter = %cst0, %arg0 = %true) : (i32, i1) -> i32 { + scf.condition(%arg0) %iter : i32 + } do { + ^bb0(%arg1: i32): + %next = arith.addi %arg1, %cst5 : i32 + %cond = arith.cmpi slt, %next, %bound : i32 + scf.yield %next, %cond : i32, i1 + } + return %res : i32 +} + +// CHECK-LABEL: func.func @wrap_while_loop_with_minimal_before_block( +// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[TRUE:.*]] = arith.constant true +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// CHECK: %[[IF:.*]] = scf.if %[[TRUE]] -> (i32) { +// CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[C0]]) : (i32) -> i32 { +// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32 +// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// CHECK: scf.condition(%[[COND]])
%[[NEXT]] : i32 +// CHECK: } do { +// CHECK: ^bb0(%[[ARG2:.*]]: i32): +// CHECK: scf.yield %[[ARG2]] : i32 +// CHECK: } +// CHECK: scf.yield %[[WHILE]] : i32 +// CHECK: } else { +// CHECK: scf.yield %[[C0]] : i32 +// CHECK: } +// CHECK: return %[[IF]] : i32 + +// ----- + +func.func @wrap_do_while_loop_in_zero_trip_check(%bound : i32) -> i32 { + %cst0 = arith.constant 0 : i32 + %cst5 = arith.constant 5 : i32 + %res = scf.while (%iter = %cst0) : (i32) -> i32 { + %next = arith.addi %iter, %cst5 : i32 + %cond = arith.cmpi slt, %next, %bound : i32 + scf.condition(%cond) %next : i32 + } do { + ^bb0(%arg1: i32): + scf.yield %arg1 : i32 + } + return %res : i32 +} + +// CHECK-LABEL: func.func @wrap_do_while_loop_in_zero_trip_check( +// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// CHECK-NOT: scf.if +// CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[C0]]) : (i32) -> i32 { +// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32 +// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// CHECK: scf.condition(%[[COND]]) %[[NEXT]] : i32 +// CHECK: } do { +// CHECK: ^bb0(%[[ARG2:.*]]: i32): +// CHECK: scf.yield %[[ARG2]] : i32 +// CHECK: } +// CHECK: return %[[WHILE]] : i32 + +// FORCE-CREATE-CHECK-LABEL: func.func @wrap_do_while_loop_in_zero_trip_check( +// FORCE-CREATE-CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 { +// FORCE-CREATE-CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// FORCE-CREATE-CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32 +// FORCE-CREATE-CHECK: %[[PRE_NEXT:.*]] = arith.addi %[[C0]], %[[C5]] : i32 +// FORCE-CREATE-CHECK: %[[PRE_COND:.*]] = arith.cmpi slt, %[[PRE_NEXT]], %[[BOUND]] : i32 +// FORCE-CREATE-CHECK: %[[IF:.*]] = scf.if %[[PRE_COND]] -> (i32) { +// FORCE-CREATE-CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[PRE_NEXT]]) : (i32) -> i32 { +// FORCE-CREATE-CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32 +// FORCE-CREATE-CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32 +// FORCE-CREATE-CHECK: scf.condition(%[[COND]]) %[[NEXT]] : i32 +// FORCE-CREATE-CHECK: } do { +// FORCE-CREATE-CHECK: ^bb0(%[[ARG2:.*]]: i32): +// FORCE-CREATE-CHECK: scf.yield %[[ARG2]] : i32 +// FORCE-CREATE-CHECK: } +// FORCE-CREATE-CHECK: scf.yield %[[WHILE]] : i32 +// FORCE-CREATE-CHECK: } else { +// FORCE-CREATE-CHECK: scf.yield %[[PRE_NEXT]] : i32 +// FORCE-CREATE-CHECK: } +// FORCE-CREATE-CHECK: return %[[IF]] : i32 diff --git a/mlir/test/lib/Dialect/SCF/CMakeLists.txt b/mlir/test/lib/Dialect/SCF/CMakeLists.txt index 22c2f238..d93bd55 100644 --- a/mlir/test/lib/Dialect/SCF/CMakeLists.txt +++ b/mlir/test/lib/Dialect/SCF/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_library(MLIRSCFTestPasses TestLoopParametricTiling.cpp TestLoopUnrolling.cpp TestSCFUtils.cpp + TestSCFWrapInZeroTripCheck.cpp TestWhileOpBuilder.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp new file mode 100644 index 0000000..10206dd --- /dev/null +++ b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp @@ -0,0 +1,72 @@ +//===- TestWrapInZeroTripCheck.cpp -- Passes to test SCF zero-trip-check --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the passes to test wrap-in-zero-trip-check transforms on +// SCF loop ops. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { + +struct TestWrapWhileLoopInZeroTripCheckPass : public PassWrapper<TestWrapWhileLoopInZeroTripCheckPass, OperationPass<func::FuncOp>> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( TestWrapWhileLoopInZeroTripCheckPass) + + StringRef getArgument() const final { return "test-wrap-scf-while-loop-in-zero-trip-check"; } + + StringRef getDescription() const final { return "test scf::wrapWhileLoopInZeroTripCheck"; } + + TestWrapWhileLoopInZeroTripCheckPass() = default; + TestWrapWhileLoopInZeroTripCheckPass( const TestWrapWhileLoopInZeroTripCheckPass &) {} + explicit TestWrapWhileLoopInZeroTripCheckPass(bool forceCreateCheckParam) { forceCreateCheck = forceCreateCheckParam; } + + void runOnOperation() override { func::FuncOp func = getOperation(); MLIRContext *context = &getContext(); IRRewriter rewriter(context); func.walk([&](scf::WhileOp op) { FailureOr<scf::WhileOp> result = scf::wrapWhileLoopInZeroTripCheck(op, rewriter, forceCreateCheck); // Ignore not implemented failure in tests. The expected output should // catch problems (e.g. transformation doesn't happen). (void)result; }); } + + Option<bool> forceCreateCheck{ *this, "force-create-check", llvm::cl::desc("Force to create zero-trip-check."), llvm::cl::init(false)}; }; + +} // namespace + +namespace mlir { +namespace test { +void registerTestSCFWrapInZeroTripCheckPasses() { PassRegistration<TestWrapWhileLoopInZeroTripCheckPass>(); } +} // namespace test +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 1b3f60b..cec1e52 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -127,6 +127,7 @@ void registerTestPadFusion(); void registerTestRecursiveTypesPass(); void registerTestSCFUtilsPass(); void registerTestSCFWhileOpBuilderPass(); +void registerTestSCFWrapInZeroTripCheckPasses(); void registerTestShapeMappingPass(); void registerTestSliceAnalysisPass(); void registerTestTensorCopyInsertionPass(); @@ -250,6 +251,7 @@ void registerTestPasses() { mlir::test::registerTestRecursiveTypesPass(); mlir::test::registerTestSCFUtilsPass(); mlir::test::registerTestSCFWhileOpBuilderPass(); + mlir::test::registerTestSCFWrapInZeroTripCheckPasses(); mlir::test::registerTestShapeMappingPass(); mlir::test::registerTestSliceAnalysisPass(); mlir::test::registerTestTensorCopyInsertionPass(); -- cgit v1.1 From 8c106a15156857d23ba9e61c55b49b1e2b6c1583 Mon Sep 17 00:00:00 2001 From: Valentin Clement (バレンタイン クレメン) Date: Thu, 8 Feb 2024 15:13:48 -0800 Subject: [flang] Fix attribute printing for fir.global op (#81197) The custom printer for `fir.global` was eliding all the attributes present on the op when printing the attribute dictionary, so any attribute that is not part of the pretty printing was discarded. This patch fixes the printer and also makes use of the getters for the attribute names where they were hardcoded.
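As a rough illustration of the idea behind the fix (a standalone C++ toy, not the actual MLIR/FIR printer API; names like `printAttrDict` are invented for this sketch): a custom printer that renders some attributes with dedicated syntax must elide exactly those attributes from the generic dictionary and nothing else, otherwise user attributes such as `keep_my_attr` are silently dropped.

```cpp
#include <iostream>
#include <map>
#include <set>
#include <string>

// Print an attribute dictionary, skipping only the attributes the caller
// pretty-prints itself (the elided set).
void printAttrDict(const std::map<std::string, std::string> &Attrs,
                   const std::set<std::string> &Elided) {
  bool First = true;
  for (const auto &[Key, Value] : Attrs) {
    if (Elided.count(Key))
      continue; // rendered with dedicated syntax elsewhere
    std::cout << (First ? " {" : ", ") << Key << " = " << Value;
    First = false;
  }
  if (!First)
    std::cout << '}';
}

int main() {
  const std::map<std::string, std::string> Attrs = {
      {"constant", "unit"},
      {"keep_my_attr", "\"data\""},
      {"sym_name", "\"t1\""}};
  // Over-eliding (roughly the old behavior): every attribute vanishes.
  printAttrDict(Attrs, {"constant", "keep_my_attr", "sym_name"});
  std::cout << '\n';
  // Eliding only the pretty-printed subset: keep_my_attr round-trips,
  // which is what the new fir-ops.fir test below checks.
  printAttrDict(Attrs, {"constant", "sym_name"});
  std::cout << '\n';
}
```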
--- flang/lib/Optimizer/Dialect/FIROps.cpp | 19 ++++++++++++------- flang/test/Fir/fir-ops.fir | 7 +++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 483f318..a5b31da 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1348,12 +1348,12 @@ mlir::ParseResult fir::GlobalOp::parse(mlir::OpAsmParser &parser, if (parser.parseOptionalAttrDict(result.attributes)) return mlir::failure(); - if (succeeded(parser.parseOptionalKeyword("constant"))) { + if (succeeded(parser.parseOptionalKeyword(getConstantAttrNameStr()))) { // if "constant" keyword then mark this as a constant, not a variable - result.addAttribute("constant", builder.getUnitAttr()); + result.addAttribute(getConstantAttrNameStr(), builder.getUnitAttr()); } - if (succeeded(parser.parseOptionalKeyword("target"))) + if (succeeded(parser.parseOptionalKeyword(getTargetAttrNameStr()))) result.addAttribute(getTargetAttrNameStr(), builder.getUnitAttr()); mlir::Type globalType; @@ -1382,11 +1382,16 @@ void fir::GlobalOp::print(mlir::OpAsmPrinter &p) { p.printAttributeWithoutType(getSymrefAttr()); if (auto val = getValueOrNull()) p << '(' << val << ')'; - p.printOptionalAttrDict((*this)->getAttrs(), (*this).getAttributeNames()); - if (getOperation()->getAttr(fir::GlobalOp::getConstantAttrNameStr())) - p << " constant"; + // Print all other attributes that are not pretty printed here. + p.printOptionalAttrDict((*this)->getAttrs(), /*elideAttrs=*/{ + getSymNameAttrName(), getSymrefAttrName(), + getTypeAttrName(), getConstantAttrName(), + getTargetAttrName(), getLinkNameAttrName(), + getInitValAttrName()}); + if (getOperation()->getAttr(getConstantAttrName())) + p << " " << getConstantAttrNameStr(); if (getOperation()->getAttr(getTargetAttrName())) - p << " target"; + p << " " << getTargetAttrNameStr(); p << " : "; p.printType(getType()); if (hasInitializationBody()) { diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir index 3c4095b..962621c 100644 --- a/flang/test/Fir/fir-ops.fir +++ b/flang/test/Fir/fir-ops.fir @@ -893,3 +893,10 @@ func.func @test_box_typecode(%a: !fir.class) { // CHECK-LABEL: func.func @test_box_typecode( // CHECK-SAME: %[[A:.*]]: !fir.class) // CHECK: %{{.*}} = fir.box_typecode %[[A]] : (!fir.class) -> i32 + +fir.global @t1 {keep_my_attr = "data"} : i32 { + %1 = arith.constant 0 : i32 + fir.has_value %1 : i32 +} + +// CHECK-LABEL: fir.global @t1 {keep_my_attr = "data"} : i32 -- cgit v1.1 From 9affa177b526459beddafad30474d2e3186376e4 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 8 Feb 2024 15:14:13 -0800 Subject: [NVPTX] Add support for calling aliases (#81170) The current implementation of aliases tries to remove all the aliases in the module to prevent the generic version of `AsmPrinter` from emitting them incorrectly. Unfortunately, if the aliases are used this will fail. Instead let's override the function to print aliases directly. In addition, the declarations of the alias functions must occur before the uses. To fix this we emit alias declarations as part of `emitDeclarations` and only emit the `.alias` directives at the end (where we can assume the aliasee has also already been declared). 
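To make the ordering constraint concrete, here is a small self-contained C++ sketch of the two-phase emission described above (illustrative only; `emitModule` and its types are invented for this example and are not the actual `NVPTXAsmPrinter` interface):

```cpp
#include <iostream>
#include <string>
#include <vector>

struct Alias {
  std::string Name;
  std::string Aliasee;
};

// Toy PTX-like module emission: declare first, bind aliases last.
void emitModule(const std::vector<std::string> &Funcs,
                const std::vector<Alias> &Aliases) {
  // Phase 1: forward-declare ordinary functions and aliases alike, so a
  // later call through an alias always references an already-declared
  // symbol.
  for (const std::string &F : Funcs)
    std::cout << ".func " << F << ";\n";
  for (const Alias &A : Aliases)
    std::cout << ".func " << A.Name << ";\n";
  // ... function bodies, which may call the aliases, would be emitted here ...
  // Phase 2: emit the .alias directives only at the end, once every aliasee
  // has been declared and defined.
  for (const Alias &A : Aliases)
    std::cout << ".alias " << A.Name << ", " << A.Aliasee << ";\n";
}

int main() {
  emitModule({"a", "foo"}, {{"b", "a"}, {"c", "a"}, {"bar", "foo"}});
  return 0;
}
```

This mirrors the shape of the output checked in the updated `alias.ll` test below: declarations up front, `.alias` bindings at the bottom.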
--- llvm/include/llvm/CodeGen/AsmPrinter.h | 2 +- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 54 ++++++++++++++---------------- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 5 ++- llvm/test/CodeGen/NVPTX/alias.ll | 52 +++++++++++++++++++++------- 5 files changed, 70 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index fbd198a..a7fbf4a 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -897,7 +897,7 @@ private: virtual void emitModuleCommandLines(Module &M); GCMetadataPrinter *getOrCreateGCPrinter(GCStrategy &S); - void emitGlobalAlias(Module &M, const GlobalAlias &GA); + virtual void emitGlobalAlias(const Module &M, const GlobalAlias &GA); void emitGlobalIFunc(Module &M, const GlobalIFunc &GI); private: diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index b961fc2..e89a1c26 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2127,7 +2127,7 @@ void AsmPrinter::emitGlobalGOTEquivs() { emitGlobalVariable(GV); } -void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) { +void AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) { MCSymbol *Name = getSymbol(&GA); bool IsFunction = GA.getValueType()->isFunctionTy(); // Treat bitcasts of functions as functions also. This is important at least diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index cdfc288..2219d9f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -57,6 +57,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" @@ -605,14 +606,33 @@ void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, O << getVirtualRegisterName(vr); } +void NVPTXAsmPrinter::emitAliasDeclaration(const GlobalAlias *GA, + raw_ostream &O) { + const Function *F = dyn_cast_or_null<Function>(GA->getAliaseeObject()); + if (!F || isKernelFunction(*F) || F->isDeclaration()) + report_fatal_error( "NVPTX aliasee must be a non-kernel function definition"); + + if (GA->hasLinkOnceLinkage() || GA->hasWeakLinkage() || + GA->hasAvailableExternallyLinkage() || GA->hasCommonLinkage()) + report_fatal_error("NVPTX aliasee must not be '.weak'"); + + emitDeclarationWithName(F, getSymbol(GA), O); +} + void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { + emitDeclarationWithName(F, getSymbol(F), O); +} + +void NVPTXAsmPrinter::emitDeclarationWithName(const Function *F, MCSymbol *S, + raw_ostream &O) { emitLinkageDirective(F, O); if (isKernelFunction(*F)) O << ".entry "; else O << ".func "; printReturnValStr(F, O); - getSymbol(F)->print(O, MAI); + S->print(O, MAI); O << "\n"; emitFunctionParamList(F, O); O << "\n"; @@ -759,6 +779,8 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { } seenMap[&F] = true; } + for (const GlobalAlias &GA : M.aliases()) + emitAliasDeclaration(&GA, O); } static bool isEmptyXXStructor(GlobalVariable *GV) { @@ -853,25 +875,9 @@ void NVPTXAsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) { raw_svector_ostream OS(Str); MCSymbol *Name = getSymbol(&GA); - const Function *F = dyn_cast<Function>(GA.getAliasee()); - if (!F ||
isKernelFunction(*F)) - report_fatal_error("NVPTX aliasee must be a non-kernel function"); - - if (GA.hasLinkOnceLinkage() || GA.hasWeakLinkage() || - GA.hasAvailableExternallyLinkage() || GA.hasCommonLinkage()) - report_fatal_error("NVPTX aliasee must not be '.weak'"); - - OS << "\n"; - emitLinkageDirective(F, OS); - OS << ".func "; - printReturnValStr(F, OS); - OS << Name->getName(); - emitFunctionParamList(F, OS); - if (shouldEmitPTXNoReturn(F, TM)) - OS << "\n.noreturn"; - OS << ";\n"; - OS << ".alias " << Name->getName() << ", " << F->getName() << ";\n"; + OS << ".alias " << Name->getName() << ", " << GA.getAliaseeObject()->getName() + << ";\n"; OutStreamer->emitRawText(OS.str()); } @@ -932,16 +938,6 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { GlobalsEmitted = true; } - // If we have any aliases we emit them at the end. - SmallVector<GlobalAlias *> AliasesToRemove; - for (GlobalAlias &Alias : M.aliases()) { - emitGlobalAlias(M, Alias); - AliasesToRemove.push_back(&Alias); - } - - for (GlobalAlias *A : AliasesToRemove) - A->eraseFromParent(); - // call doFinalization bool ret = AsmPrinter::doFinalization(M); diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 7f0f37e..979d185 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -27,6 +27,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCExpr.h" @@ -174,7 +175,7 @@ private: void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool processDemoted, const NVPTXSubtarget &STI); void emitGlobals(const Module &M); - void emitGlobalAlias(const Module &M, const GlobalAlias &GA); + void emitGlobalAlias(const Module &M, const GlobalAlias &GA) override; void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, raw_ostream &); @@ -222,6 +223,8 @@ private: void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); + void emitAliasDeclaration(const GlobalAlias *, raw_ostream &O); + void emitDeclarationWithName(const Function *, MCSymbol *, raw_ostream &O); void emitDemotedVars(const Function *, raw_ostream &); bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/llvm/test/CodeGen/NVPTX/alias.ll b/llvm/test/CodeGen/NVPTX/alias.ll index d5dc3a1..cb592dd 100644 --- a/llvm/test/CodeGen/NVPTX/alias.ll +++ b/llvm/test/CodeGen/NVPTX/alias.ll @@ -1,8 +1,10 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | %ptxas-verify %} define i32 @a() { ret i32 0 } @b = internal alias i32 (), ptr @a @c = internal alias i32 (), ptr @a +@d = internal alias i32 (), ptr @c define void @foo(i32 %0, ptr %1) { ret void } @bar = alias i32 (), ptr @foo @@ -12,8 +14,37 @@ define void @noreturn() #0 { } @noreturn_alias = alias i32 (), ptr @noreturn +define i32 @z() { + %val = call i32 @b() + ret i32 %val +} + + attributes #0 = { noreturn } +; CHECK: .visible .func (.param .b32 func_retval0) b +; CHECK-NEXT: () +; CHECK-NEXT: ; + +; CHECK: .visible .func (.param .b32 func_retval0) c +; CHECK-NEXT: () +; CHECK-NEXT: ; + +; CHECK: .visible .func (.param
.b32 func_retval0) d +; CHECK-NEXT: () +; CHECK-NEXT: ; + +; CHECK: .visible .func bar +; CHECK-NEXT: ( +; CHECK-NEXT: .param .b32 foo_param_0, +; CHECK-NEXT: .param .b64 foo_param_1 +; CHECK-NEXT: ) +; CHECK-NEXT: ; + +; CHECK: .visible .func noreturn_alias +; CHECK-NEXT: () +; CHECK-NEXT: .noreturn; + ; CHECK: .visible .func (.param .b32 func_retval0) a() ; CHECK: .visible .func foo( @@ -24,18 +55,13 @@ attributes #0 = { noreturn } ; CHECK: .visible .func noreturn() ; CHECK-NEXT: .noreturn -; CHECK: .visible .func (.param .b32 func_retval0) b(); -; CHECK-NEXT: .alias b, a; +; CHECK: .visible .func (.param .b32 func_retval0) z() +; CHECK: call.uni (retval0), +; CHECK-NEXT: b, -; CHECK: .visible .func (.param .b32 func_retval0) c(); -; CHECK-NEXT: .alias c, a; -; CHECK: .visible .func bar( -; CHECK-NEXT: .param .b32 foo_param_0, -; CHECK-NEXT: .param .b64 foo_param_1 -; CHECK-NEXT: ); -; CHECK-NEXT: .alias bar, foo; - -; CHECK: .visible .func noreturn_alias() -; CHECK-NEXT: .noreturn; -; CHECK-NEXT: .alias noreturn_alias, noreturn; +; CHECK: .alias b, a; +; CHECK: .alias c, a; +; CHECK: .alias d, a; +; CHECK: .alias bar, foo; +; CHECK: .alias noreturn_alias, noreturn; -- cgit v1.1 From 9211e67da36782db44a46ccb9ac06734ccf2570f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:16:31 -0600 Subject: [NVVMReflect] Force dead branch elimination in NVVMReflect (#81189) Summary: The `__nvvm_reflect` function is used to guard invalid code that varies between architectures. One problem with this feature is that if it is used without optimizations, it will leave invalid code in the module that will then make it to the backend. The `__nvvm_reflect` pass is already mandatory, so it should do some trivial branch removal to ensure that constants are handled correctly. This dead branch elimination only works in the trivial case of a compare on a branch and does not touch any conditionals that were not related to the `__nvvm_reflect` call, in order to preserve `O0` semantics as much as possible. This should allow the following to work on NVPTX targets: ```c int foo() { if (__nvvm_reflect("__CUDA_ARCH") >= 700) asm("valid;\n"); } ``` --- llvm/docs/NVPTXUsage.rst | 5 + llvm/lib/Target/NVPTX/NVVMReflect.cpp | 62 +++++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 141 ++++++++++++++++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll | 1 - 4 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 22acc6c..b5e3918 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -296,6 +296,11 @@ pipeline, immediately after the link stage. The ``internalize`` pass is also recommended to remove unused math functions from the resulting PTX. For an input IR module ``module.bc``, the following compilation flow is recommended: +The ``NVVMReflect`` pass will attempt to remove dead code even without +optimizations. This allows potentially incompatible instructions to be avoided +at all optimization levels. This currently only works for simple conditionals +like the above example. + 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` 3. 
Internalize all functions not in list from (1) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 7d2678a..5283c2f 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -20,6 +20,7 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -36,6 +37,8 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include #include #define NVVM_REFLECT_FUNCTION "__nvvm_reflect" @@ -87,6 +90,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; + SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -171,6 +175,13 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } + + // If the immediate user is a simple comparison we want to simplify it. + // TODO: This currently does not handle switch instructions. + for (User *U : Call->users()) + if (ICmpInst *I = dyn_cast(U)) + ToSimplify.push_back(I); + Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); } @@ -178,6 +189,57 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { for (Instruction *I : ToRemove) I->eraseFromParent(); + // The code guarded by __nvvm_reflect may be invalid for the target machine. + // We need to do some basic dead code elimination to trim invalid code before + // it reaches the backend at all optimization levels. + SmallVector Simplified; + for (ICmpInst *Cmp : ToSimplify) { + Constant *LHS = dyn_cast(Cmp->getOperand(0)); + Constant *RHS = dyn_cast(Cmp->getOperand(1)); + + if (!LHS || !RHS) + continue; + + // If the comparison is a compile time constant we simply propagate it. + Constant *C = ConstantFoldCompareInstOperands( + Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); + + if (!C) + continue; + + for (User *U : Cmp->users()) + if (BranchInst *I = dyn_cast(U)) + Simplified.push_back(I); + + Cmp->replaceAllUsesWith(C); + Cmp->eraseFromParent(); + } + + // Each instruction here is a conditional branch off of a constant true or + // false value. Simply replace it with an unconditional branch to the + // appropriate basic block and delete the rest if it is trivially dead. + DenseSet Removed; + for (BranchInst *Branch : Simplified) { + if (Removed.contains(Branch)) + continue; + + ConstantInt *C = dyn_cast(Branch->getCondition()); + if (!C || (!C->isOne() && !C->isZero())) + continue; + + BasicBlock *TrueBB = + C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); + BasicBlock *FalseBB = + C->isOne() ? 
Branch->getSuccessor(1) : Branch->getSuccessor(0); + + ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); + if (FalseBB->use_empty() && FalseBB->hasNPredecessors(0) && + FalseBB->getFirstNonPHIOrDbg()) { + Removed.insert(FalseBB->getFirstNonPHIOrDbg()); + changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); + } + } + return ToRemove.size() > 0; } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll new file mode 100644 index 0000000..c9586d5 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -0,0 +1,141 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_52 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_70 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefix=SM_90 + +@.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00" + +declare i32 @__nvvm_reflect(ptr) + +; SM_52: .visible .func (.param .b32 func_retval0) foo() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) foo() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) foo() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @foo() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 900 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %return + +if.else: + %call1 = call i32 @__nvvm_reflect(ptr @.str) + %cmp2 = icmp uge i32 %call1, 700 + br i1 %cmp2, label %if.then3, label %if.else4 + +if.then3: + br label %return + +if.else4: + %call5 = call i32 @__nvvm_reflect(ptr @.str) + %cmp6 = icmp uge i32 %call5, 520 + br i1 %cmp6, label %if.then7, label %if.else8 + +if.then7: + br label %return + +if.else8: + br label %return + +return: + %retval.0 = phi i32 [ 1, %if.then ], [ 2, %if.then3 ], [ 3, %if.then7 ], [ 4, %if.else8 ] + ret i32 %retval.0 +} + +; SM_52: .visible .func (.param .b32 func_retval0) bar() +; SM_52: mov.b32 %[[REG:.+]], 2; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) bar() +; SM_70: mov.b32 %[[REG:.+]], 1; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) bar() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @bar() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %if.end + +if.else: + br label %if.end + +if.end: + %x = phi i32 [ 1, %if.then ], [ 2, %if.else ] + ret i32 %x +} + +; SM_52-NOT: valid; +; SM_70: valid; +; SM_90: valid; +define void @baz() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void asm sideeffect "valid;\0A", ""() + br label %if.end + +if.end: + ret void +} + +; SM_52: .visible .func (.param .b32 func_retval0) qux() +; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_52: ret; +; SM_70: .visible .func (.param .b32 
func_retval0) qux() +; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_70: ret; +; SM_90: .visible .func (.param .b32 func_retval0) qux() +; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_90: ret; +define i32 @qux() { +entry: + %call = call i32 @__nvvm_reflect(ptr noundef @.str) + %cmp = icmp uge i32 %call, 700 + %conv = zext i1 %cmp to i32 + switch i32 %conv, label %sw.default [ + i32 900, label %sw.bb + i32 700, label %sw.bb1 + i32 520, label %sw.bb2 + ] + +sw.bb: + br label %return + +sw.bb1: + br label %return + +sw.bb2: + br label %return + +sw.default: + br label %return + +return: + %retval = phi i32 [ 4, %sw.default ], [ 3, %sw.bb2 ], [ 2, %sw.bb1 ], [ 1, %sw.bb ] + ret i32 %retval +} diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll index e8c554c..ac5875c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -18,4 +18,3 @@ define i32 @foo(float %a, float %b) { ; SM35: ret i32 350 ret i32 %reflect } - -- cgit v1.1 From 42230e213e11a0cf9cdbdcd49225eb0d325ef007 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:17:21 -0600 Subject: [LinkerWrapper] Allow 'all' as a generic bundled architecture (#81193) Summary: Currently, the linker wrapper sorts input files into different link jobs according to their architectures. Here we assume each architecture is a unique and incompatible link job unless they are specifically marked compatible. This patch simply adds an `all` target to represent an architecture that should be linked against every single other architecture. This will be useful for modelling generic IR such as the ROCm device libraries or the NVPTX libdevice. 
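To make the new rule concrete, here is a minimal self-contained sketch of the compatibility predicate this patch changes. The pair layout mirrors `OffloadFile::TargetID` (`<triple, architecture>`), but the types and the final exact-match fallback are simplified stand-ins; the real `areTargetsCompatible` in `OffloadBinary.cpp` additionally performs AMDGPU-specific target-ID checks that are omitted here.

```cpp
#include <string>
#include <utility>

// Simplified stand-in for OffloadFile::TargetID: <triple, architecture>.
using TargetID = std::pair<std::string, std::string>;

// Sketch of the rule after this patch: images for different triples never
// link together, but an architecture of "all" (generic IR such as the ROCm
// device libraries or libdevice) joins every link job for its triple.
bool areTargetsCompatibleSketch(const TargetID &LHS, const TargetID &RHS) {
  if (LHS.first != RHS.first)
    return false; // mismatched triples are never compatible
  if (LHS.second == "all" || RHS.second == "all")
    return true; // "all" is compatible with every architecture
  return LHS.second == RHS.second; // simplified exact-match fallback
}
```

Under this rule, the `arch=all` archive member in the new test below is pulled into both the `gfx908` and the `gfx90a` link jobs, as the ARCH-ALL checks expect.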
--- clang/test/Driver/linker-wrapper.c | 16 ++++++++++++++++ llvm/lib/Object/OffloadBinary.cpp | 4 ++++ 2 files changed, 20 insertions(+) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 010001b..647629a 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -172,6 +172,22 @@ __attribute__((visibility("protected"), used)) int x; // AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o // AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// RUN: clang-offload-packager -o %t-lib.out \ +// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=all +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out +// RUN: llvm-ar rcs %t.a %t.o +// RUN: clang-offload-packager -o %t1.out \ +// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t1.o -fembed-offload-object=%t1.out +// RUN: clang-offload-packager -o %t2.out \ +// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t2.o -fembed-offload-object=%t2.out +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ +// RUN: --linker-path=/usr/bin/ld -- %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL + +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o + // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp index 22d604b..58b9b39 100644 --- a/llvm/lib/Object/OffloadBinary.cpp +++ b/llvm/lib/Object/OffloadBinary.cpp @@ -355,6 +355,10 @@ bool object::areTargetsCompatible(const OffloadFile::TargetID &LHS, if (LHS.first != RHS.first) return false; + // If the architecture is "all" we assume it is always compatible. + if (LHS.second.equals("all") || RHS.second.equals("all")) + return true; + // Only The AMDGPU target requires additional checks. 
llvm::Triple T(LHS.first); if (!T.isAMDGPU()) -- cgit v1.1 From c429f48b56714f9366eee8490061bd008635cbc0 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Thu, 8 Feb 2024 15:20:37 -0800 Subject: [Object][WebAssembly] Improve error on invalid relocation (#81203) See https://github.com/emscripten-core/emscripten/issues/21140 --- llvm/lib/Object/WasmObjectFile.cpp | 40 ++++++++++++++----------------- llvm/test/Object/wasm-bad-relocation.yaml | 35 +++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 22 deletions(-) create mode 100644 llvm/test/Object/wasm-bad-relocation.yaml diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index ea17154..1d68687 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1034,6 +1034,13 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { if (Reloc.Offset < PreviousOffset) return make_error("relocations not in offset order", object_error::parse_failed); + + auto badReloc = [&](StringRef msg) { + return make_error( + msg + ": " + Twine(Symbols[Reloc.Index].Info.Name), + object_error::parse_failed); + }; + PreviousOffset = Reloc.Offset; Reloc.Index = readVaruint32(Ctx); switch (type) { @@ -1046,18 +1053,15 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { case wasm::R_WASM_TABLE_INDEX_REL_SLEB: case wasm::R_WASM_TABLE_INDEX_REL_SLEB64: if (!isValidFunctionSymbol(Reloc.Index)) - return make_error( - "invalid relocation function index", object_error::parse_failed); + return badReloc("invalid function relocation"); break; case wasm::R_WASM_TABLE_NUMBER_LEB: if (!isValidTableSymbol(Reloc.Index)) - return make_error("invalid relocation table index", - object_error::parse_failed); + return badReloc("invalid table relocation"); break; case wasm::R_WASM_TYPE_INDEX_LEB: if (Reloc.Index >= Signatures.size()) - return make_error("invalid relocation type index", - object_error::parse_failed); + return badReloc("invalid relocation type index"); break; case wasm::R_WASM_GLOBAL_INDEX_LEB: // R_WASM_GLOBAL_INDEX_LEB are can be used against function and data @@ -1065,18 +1069,15 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { if (!isValidGlobalSymbol(Reloc.Index) && !isValidDataSymbol(Reloc.Index) && !isValidFunctionSymbol(Reloc.Index)) - return make_error("invalid relocation global index", - object_error::parse_failed); + return badReloc("invalid global relocation"); break; case wasm::R_WASM_GLOBAL_INDEX_I32: if (!isValidGlobalSymbol(Reloc.Index)) - return make_error("invalid relocation global index", - object_error::parse_failed); + return badReloc("invalid global relocation"); break; case wasm::R_WASM_TAG_INDEX_LEB: if (!isValidTagSymbol(Reloc.Index)) - return make_error("invalid relocation tag index", - object_error::parse_failed); + return badReloc("invalid tag relocation"); break; case wasm::R_WASM_MEMORY_ADDR_LEB: case wasm::R_WASM_MEMORY_ADDR_SLEB: @@ -1085,8 +1086,7 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { case wasm::R_WASM_MEMORY_ADDR_TLS_SLEB: case wasm::R_WASM_MEMORY_ADDR_LOCREL_I32: if (!isValidDataSymbol(Reloc.Index)) - return make_error("invalid relocation data index", - object_error::parse_failed); + return badReloc("invalid data relocation"); Reloc.Addend = readVarint32(Ctx); break; case wasm::R_WASM_MEMORY_ADDR_LEB64: @@ -1095,26 +1095,22 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { case wasm::R_WASM_MEMORY_ADDR_REL_SLEB64: case 
wasm::R_WASM_MEMORY_ADDR_TLS_SLEB64: if (!isValidDataSymbol(Reloc.Index)) - return make_error("invalid relocation data index", - object_error::parse_failed); + return badReloc("invalid data relocation"); Reloc.Addend = readVarint64(Ctx); break; case wasm::R_WASM_FUNCTION_OFFSET_I32: if (!isValidFunctionSymbol(Reloc.Index)) - return make_error( - "invalid relocation function index", object_error::parse_failed); + return badReloc("invalid function relocation"); Reloc.Addend = readVarint32(Ctx); break; case wasm::R_WASM_FUNCTION_OFFSET_I64: if (!isValidFunctionSymbol(Reloc.Index)) - return make_error( - "invalid relocation function index", object_error::parse_failed); + return badReloc("invalid function relocation"); Reloc.Addend = readVarint64(Ctx); break; case wasm::R_WASM_SECTION_OFFSET_I32: if (!isValidSectionSymbol(Reloc.Index)) - return make_error( - "invalid relocation section index", object_error::parse_failed); + return badReloc("invalid section relocation"); Reloc.Addend = readVarint32(Ctx); break; default: diff --git a/llvm/test/Object/wasm-bad-relocation.yaml b/llvm/test/Object/wasm-bad-relocation.yaml new file mode 100644 index 0000000..aed405c --- /dev/null +++ b/llvm/test/Object/wasm-bad-relocation.yaml @@ -0,0 +1,35 @@ +# RUN: yaml2obj %s | not llvm-objdump -s - 2>&1 | FileCheck %s + +# Check for invalid relocations. In this case we have a relocations of type +# R_WASM_FUNCTION_INDEX_LEB against a symbol (foo) which is not a function +# symbol but a data symbol. + +# CHECK: invalid function relocation: foo + +--- !WASM +FileHeader: + Version: 0x00000001 +Sections: + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '6401020304' + Relocations: + - Type: R_WASM_FUNCTION_INDEX_LEB + Index: 0 + Offset: 0x00000000 + - Type: CUSTOM + Name: linking + Version: 2 + SymbolTable: + - Index: 0 + Kind: DATA + Name: foo + Flags: [ ] + Segment: 0 + Offset: 0 + Size: 1 -- cgit v1.1 From 0800a36053943beabe1c3f98fe4ecccbc192a2a7 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:31:47 -0600 Subject: Revert "[NVVMReflect] Force dead branch elimination in NVVMReflect (#81189)" This reverts commit 9211e67da36782db44a46ccb9ac06734ccf2570f. Summary: This seemed to crash one one of the CUDA math tests. Revert until it can be fixed. --- llvm/docs/NVPTXUsage.rst | 5 - llvm/lib/Target/NVPTX/NVVMReflect.cpp | 62 ----------- llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 141 ------------------------ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll | 1 + 4 files changed, 1 insertion(+), 208 deletions(-) delete mode 100644 llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index b5e3918..22acc6c 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -296,11 +296,6 @@ pipeline, immediately after the link stage. The ``internalize`` pass is also recommended to remove unused math functions from the resulting PTX. For an input IR module ``module.bc``, the following compilation flow is recommended: -The ``NVVMReflect`` pass will attempt to remove dead code even without -optimizations. This allows potentially incompatible instructions to be avoided -at all optimizations levels. This currently only works for simple conditionals -like the above example. - 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` 3. 
Internalize all functions not in list from (1) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 5283c2f..7d2678a 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -20,7 +20,6 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -37,8 +36,6 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include #include #define NVVM_REFLECT_FUNCTION "__nvvm_reflect" @@ -90,7 +87,6 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; - SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -175,13 +171,6 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } - - // If the immediate user is a simple comparison we want to simplify it. - // TODO: This currently does not handle switch instructions. - for (User *U : Call->users()) - if (ICmpInst *I = dyn_cast(U)) - ToSimplify.push_back(I); - Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); } @@ -189,57 +178,6 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { for (Instruction *I : ToRemove) I->eraseFromParent(); - // The code guarded by __nvvm_reflect may be invalid for the target machine. - // We need to do some basic dead code elimination to trim invalid code before - // it reaches the backend at all optimization levels. - SmallVector Simplified; - for (ICmpInst *Cmp : ToSimplify) { - Constant *LHS = dyn_cast(Cmp->getOperand(0)); - Constant *RHS = dyn_cast(Cmp->getOperand(1)); - - if (!LHS || !RHS) - continue; - - // If the comparison is a compile time constant we simply propagate it. - Constant *C = ConstantFoldCompareInstOperands( - Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); - - if (!C) - continue; - - for (User *U : Cmp->users()) - if (BranchInst *I = dyn_cast(U)) - Simplified.push_back(I); - - Cmp->replaceAllUsesWith(C); - Cmp->eraseFromParent(); - } - - // Each instruction here is a conditional branch off of a constant true or - // false value. Simply replace it with an unconditional branch to the - // appropriate basic block and delete the rest if it is trivially dead. - DenseSet Removed; - for (BranchInst *Branch : Simplified) { - if (Removed.contains(Branch)) - continue; - - ConstantInt *C = dyn_cast(Branch->getCondition()); - if (!C || (!C->isOne() && !C->isZero())) - continue; - - BasicBlock *TrueBB = - C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); - BasicBlock *FalseBB = - C->isOne() ? 
Branch->getSuccessor(1) : Branch->getSuccessor(0); - - ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); - if (FalseBB->use_empty() && FalseBB->hasNPredecessors(0) && - FalseBB->getFirstNonPHIOrDbg()) { - Removed.insert(FalseBB->getFirstNonPHIOrDbg()); - changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); - } - } - return ToRemove.size() > 0; } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll deleted file mode 100644 index c9586d5..0000000 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll +++ /dev/null @@ -1,141 +0,0 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_52 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_70 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefix=SM_90 - -@.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00" - -declare i32 @__nvvm_reflect(ptr) - -; SM_52: .visible .func (.param .b32 func_retval0) foo() -; SM_52: mov.b32 %[[REG:.+]], 3; -; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_52-NEXT: ret; -; -; SM_70: .visible .func (.param .b32 func_retval0) foo() -; SM_70: mov.b32 %[[REG:.+]], 2; -; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_70-NEXT: ret; -; -; SM_90: .visible .func (.param .b32 func_retval0) foo() -; SM_90: mov.b32 %[[REG:.+]], 1; -; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_90-NEXT: ret; -define i32 @foo() { -entry: - %call = call i32 @__nvvm_reflect(ptr @.str) - %cmp = icmp uge i32 %call, 900 - br i1 %cmp, label %if.then, label %if.else - -if.then: - br label %return - -if.else: - %call1 = call i32 @__nvvm_reflect(ptr @.str) - %cmp2 = icmp uge i32 %call1, 700 - br i1 %cmp2, label %if.then3, label %if.else4 - -if.then3: - br label %return - -if.else4: - %call5 = call i32 @__nvvm_reflect(ptr @.str) - %cmp6 = icmp uge i32 %call5, 520 - br i1 %cmp6, label %if.then7, label %if.else8 - -if.then7: - br label %return - -if.else8: - br label %return - -return: - %retval.0 = phi i32 [ 1, %if.then ], [ 2, %if.then3 ], [ 3, %if.then7 ], [ 4, %if.else8 ] - ret i32 %retval.0 -} - -; SM_52: .visible .func (.param .b32 func_retval0) bar() -; SM_52: mov.b32 %[[REG:.+]], 2; -; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_52-NEXT: ret; -; -; SM_70: .visible .func (.param .b32 func_retval0) bar() -; SM_70: mov.b32 %[[REG:.+]], 1; -; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_70-NEXT: ret; -; -; SM_90: .visible .func (.param .b32 func_retval0) bar() -; SM_90: mov.b32 %[[REG:.+]], 1; -; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; -; SM_90-NEXT: ret; -define i32 @bar() { -entry: - %call = call i32 @__nvvm_reflect(ptr @.str) - %cmp = icmp uge i32 %call, 700 - br i1 %cmp, label %if.then, label %if.else - -if.then: - br label %if.end - -if.else: - br label %if.end - -if.end: - %x = phi i32 [ 1, %if.then ], [ 2, %if.else ] - ret i32 %x -} - -; SM_52-NOT: valid; -; SM_70: valid; -; SM_90: valid; -define void @baz() { -entry: - %call = call i32 @__nvvm_reflect(ptr @.str) - %cmp = icmp uge i32 %call, 700 - br i1 %cmp, label %if.then, label %if.end - -if.then: - call void asm sideeffect "valid;\0A", ""() - br label %if.end - -if.end: - ret void -} - -; SM_52: .visible .func (.param .b32 func_retval0) qux() -; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_52: ret; -; SM_70: .visible .func (.param .b32 
func_retval0) qux() -; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_70: ret; -; SM_90: .visible .func (.param .b32 func_retval0) qux() -; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_90: ret; -define i32 @qux() { -entry: - %call = call i32 @__nvvm_reflect(ptr noundef @.str) - %cmp = icmp uge i32 %call, 700 - %conv = zext i1 %cmp to i32 - switch i32 %conv, label %sw.default [ - i32 900, label %sw.bb - i32 700, label %sw.bb1 - i32 520, label %sw.bb2 - ] - -sw.bb: - br label %return - -sw.bb1: - br label %return - -sw.bb2: - br label %return - -sw.default: - br label %return - -return: - %retval = phi i32 [ 4, %sw.default ], [ 3, %sw.bb2 ], [ 2, %sw.bb1 ], [ 1, %sw.bb ] - ret i32 %retval -} diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll index ac5875c..e8c554c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -18,3 +18,4 @@ define i32 @foo(float %a, float %b) { ; SM35: ret i32 350 ret i32 %reflect } + -- cgit v1.1 From 2572f45c7d6c081ba9b4fa344e928182f8df7773 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 8 Feb 2024 15:43:18 -0800 Subject: [flang] Fix missing generated header Fix buildbot failing because of missing HLFIRTypes.h.inc --- flang/lib/Optimizer/Dialect/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt index 745439b..58a4276 100644 --- a/flang/lib/Optimizer/Dialect/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt @@ -13,6 +13,7 @@ add_flang_library(FIRDialect CanonicalizationPatternsIncGen MLIRIR FIROpsIncGen + HLFIROpsIncGen intrinsics_gen LINK_LIBS -- cgit v1.1 From 93471466be65cf78330782d461a821ffb82e070a Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 9 Feb 2024 11:41:04 +1100 Subject: Document use of `skip-precommit-approval` label for non-review pull requests (#81053) Derived from this discussion: https://discourse.llvm.org/t/prs-without-approvals-muddy-the-waters/76656 --- llvm/docs/CodeReview.rst | 6 ++++++ llvm/docs/GitHub.rst | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/llvm/docs/CodeReview.rst b/llvm/docs/CodeReview.rst index f1d5b6c..e3a7494 100644 --- a/llvm/docs/CodeReview.rst +++ b/llvm/docs/CodeReview.rst @@ -103,6 +103,12 @@ ready to be committed. Specifically, once a patch is sent out for review, it needs an explicit approval before it is committed. Do not assume silent approval, or solicit objections to a patch with a deadline. +.. note:: + If you are using a Pull Request for purposes other than review + (eg: precommit CI results, convenient web-based reverts, etc) + `skip-precommit-approval` + label to the PR. + Acknowledge All Reviewer Feedback --------------------------------- diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index c3fbb64..51a7310 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -30,6 +30,12 @@ describes the typical workflow of creating a Pull Request and getting it reviewe and accepted. This is meant as an overview of the GitHub workflow, for complete documentation refer to `GitHub's documentation `_. +.. note:: + If you are using a Pull Request for purposes other than review + (eg: precommit CI results, convenient web-based reverts, etc) + `skip-precommit-approval` + label to the PR. 
+ GitHub Tools ------------ You can interact with GitHub in several ways: via git command line tools, -- cgit v1.1 From 1d4fc381d3da4317cc2cfa59b2d59d53decddf71 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Thu, 8 Feb 2024 16:48:04 -0800 Subject: [DWARFVerifier] Fix verification of empty line tables (#81162) A line table whose sole entry is an end sequence should not have the entry's file index verified, as that value corresponds to the initial value of the state machine, not to a real file index. In DWARF 5, this is particularly problematic as it uses 0-based indexing, and the state machine specifies a starting index of 1; in other words, you'd need to have _two_ files before such index became legal "by default". A previous attempt to fix this problem was done [1], but it was too specific in its condition, and did not capture all possible cases where this issue can happen. [1]: https://github.com/llvm/llvm-project/pull/77004 --- llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 13 +++-- .../X86/verify_empty_debug_line_sequence.yaml | 55 ++++++++++++++++++++++ 2 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 2124ff8..b523576 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -1025,6 +1025,11 @@ void DWARFVerifier::verifyDebugLineRows() { FileIndex++; } + // Nothing to verify in a line table with a single row containing the end + // sequence. + if (LineTable->Rows.size() == 1 && LineTable->Rows.front().EndSequence) + continue; + // Verify rows. uint64_t PrevAddress = 0; uint32_t RowIndex = 0; @@ -1048,13 +1053,7 @@ void DWARFVerifier::verifyDebugLineRows() { }); } - // If the prologue contains no file names and the line table has only one - // row, do not verify the file index, this is a line table of an empty - // file with an end_sequence, but the DWARF standard sets the file number - // to 1 by default, otherwise verify file index. - if ((LineTable->Prologue.FileNames.size() || - LineTable->Rows.size() != 1) && - !LineTable->hasFileAtIndex(Row.File)) { + if (!LineTable->hasFileAtIndex(Row.File)) { ++NumDebugLineErrors; ErrorCategory.Report("Invalid file index in debug_line", [&]() { error() << ".debug_line[" diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml new file mode 100644 index 0000000..1bab2c2 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml @@ -0,0 +1,55 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_line... +# CHECK: No errors + +# In a line table like the one below, with no rows (other than the +# end_sequence), we should never verify the file index because the state +# machine initializes the file index to 1, which is invalid in DWARF 5 due to +# its 0-based indexing. 
+ +# file_names[ 0]: +# name: "/home/umb/tests_2018/106_rnglists2" +# dir_index: 0 +# Address Line Column File ISA Discriminator OpIndex Flags +# ------------------ ------ ------ ------ --- ------------- ------- ------------- +# 0x0000000000000000 1 0 1 0 0 0 is_stmt end_sequence + + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Length: 0xd + Version: 5 + UnitType: DW_UT_compile + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 +Sections: + - Name: .debug_line + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 300000000500080025000000010101fb0e0d00010101010000000100000101011f010000000002011f020b010000000000000101 + - Name: .debug_line_str + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + Content: 2F686F6D652F756D622F74657374735F323031382F3130365F726E676C697374733200746573742E63707000 -- cgit v1.1 From 1389260805ec2ffb74a4fb311e7327c64d9b8f54 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 8 Feb 2024 16:51:28 -0800 Subject: [JITLink][MachO][arm64] Fix error-check order. The error check should be performed after the iterator increment, not before it. Thanks to @dcb314 for catching this! Fixes github.com/apple/swift/issues/81119 --- llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index 809b2d5..556031b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -312,10 +312,10 @@ private: Addend = SignExtend64(RI.r_symbolnum, 24); + ++RelItr; if (RelItr == RelEnd) return make_error("Unpaired Addend reloc at " + formatv("{0:x16}", FixupAddress)); - ++RelItr; RI = getRelocationInfo(RelItr); MachORelocKind = getRelocationKind(RI); -- cgit v1.1 From fbf43b0121006e371fbf50ad8642e4c62405e5cc Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Thu, 8 Feb 2024 20:13:27 -0500 Subject: [libc] Only declare float128 math functions in the generated math.h if float128 type is supported. 
(#81010) --- libc/include/math.h.def | 2 ++ libc/spec/spec.td | 4 ++++ libc/spec/stdc.td | 18 +++++++++--------- libc/utils/HdrGen/PublicAPICommand.cpp | 15 ++++++++++++++- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/libc/include/math.h.def b/libc/include/math.h.def index 813bb72..927e2d6 100644 --- a/libc/include/math.h.def +++ b/libc/include/math.h.def @@ -11,6 +11,8 @@ #include <__llvm-libc-common.h> #include +#include + %%public_api() diff --git a/libc/spec/spec.td b/libc/spec/spec.td index 0b557c8..aebf495 100644 --- a/libc/spec/spec.td +++ b/libc/spec/spec.td @@ -176,6 +176,10 @@ class FunctionSpec args> { list Args = args; } +class GuardedFunctionSpec args, string guard_macro> : FunctionSpec { + string Guard = guard_macro; +} + class ObjectSpec { string Name = name; string Type = type; diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 97dabbc..e37f95a 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -359,17 +359,17 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"copysign", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"copysignf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"copysignl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"copysignf128", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"copysignf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"ceil", RetValSpec, [ArgSpec]>, FunctionSpec<"ceilf", RetValSpec, [ArgSpec]>, FunctionSpec<"ceill", RetValSpec, [ArgSpec]>, - FunctionSpec<"ceilf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"ceilf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fabs", RetValSpec, [ArgSpec]>, FunctionSpec<"fabsf", RetValSpec, [ArgSpec]>, FunctionSpec<"fabsl", RetValSpec, [ArgSpec]>, - FunctionSpec<"fabsf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"fabsf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fdim", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fdimf", RetValSpec, [ArgSpec, ArgSpec]>, @@ -378,17 +378,17 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"floor", RetValSpec, [ArgSpec]>, FunctionSpec<"floorf", RetValSpec, [ArgSpec]>, FunctionSpec<"floorl", RetValSpec, [ArgSpec]>, - FunctionSpec<"floorf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"floorf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fmin", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fminf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fminl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminf128", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"fminf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fmax", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fmaxf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fmaxl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaxf128", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"fmaxf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, @@ -461,7 +461,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"round", RetValSpec, [ArgSpec]>, FunctionSpec<"roundf", RetValSpec, [ArgSpec]>, FunctionSpec<"roundl", RetValSpec, [ArgSpec]>, - FunctionSpec<"roundf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"roundf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"lround", RetValSpec, [ArgSpec]>, FunctionSpec<"lroundf", RetValSpec, [ArgSpec]>, @@ 
-486,12 +486,12 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"sqrt", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtf", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtl", RetValSpec, [ArgSpec]>, - FunctionSpec<"sqrtf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"sqrtf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"trunc", RetValSpec, [ArgSpec]>, FunctionSpec<"truncf", RetValSpec, [ArgSpec]>, FunctionSpec<"truncl", RetValSpec, [ArgSpec]>, - FunctionSpec<"truncf128", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"truncf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"nearbyint", RetValSpec, [ArgSpec]>, FunctionSpec<"nearbyintf", RetValSpec, [ArgSpec]>, diff --git a/libc/utils/HdrGen/PublicAPICommand.cpp b/libc/utils/HdrGen/PublicAPICommand.cpp index b1c7a07..cf6984b 100644 --- a/libc/utils/HdrGen/PublicAPICommand.cpp +++ b/libc/utils/HdrGen/PublicAPICommand.cpp @@ -102,6 +102,14 @@ void writeAPIFromIndex(APIIndexer &G, llvm::Record *RetValSpec = FunctionSpec->getValueAsDef("Return"); llvm::Record *ReturnType = RetValSpec->getValueAsDef("ReturnType"); + // TODO: https://github.com/llvm/llvm-project/issues/81208 + // Ideally, we should group functions based on their guarding macros. + bool Guarded = + (FunctionSpec->getType()->getAsString() == "GuardedFunctionSpec"); + + if (Guarded) + OS << "#ifdef " << FunctionSpec->getValueAsString("Guard") << "\n"; + OS << G.getTypeAsString(ReturnType) << " " << Name << "("; auto ArgsList = FunctionSpec->getValueAsListOfDefs("Args"); @@ -112,7 +120,12 @@ void writeAPIFromIndex(APIIndexer &G, OS << ", "; } - OS << ") __NOEXCEPT;\n\n"; + OS << ") __NOEXCEPT;\n"; + + if (Guarded) + OS << "#endif // " << FunctionSpec->getValueAsString("Guard") << "\n"; + + OS << "\n"; } // Make another pass over entrypoints to emit object declarations. -- cgit v1.1 From 4759890f859277cd798648a9a333573cd088d98a Mon Sep 17 00:00:00 2001 From: Alexey Z Date: Thu, 8 Feb 2024 20:22:27 -0500 Subject: [mlir][tensor] Fix bug in insert_slice canonical. with tensor encoding (#81045) Previously, `InsertSliceOpSourceCastInserter` was incorrectly applied to a case when tensor types have an encoding attribute attached to them. The type `newSrcType` was missing that attribute from the old `srcType`, which made the expression `srcType == newSrcType` false, since `tensor<2x2xf32, "foo">` is not equal to `tensor<2x2xf32>`. That lead to an endless back and forth between `InsertSliceOpSourceCastInserter` that would introduce a cast and `InsertSliceOpCastFolder` that would remove it right after. 
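The failure mode is easy to reproduce with a toy model of the type equality involved. The struct below is an illustrative stand-in for `RankedTensorType`, not MLIR's API; it shows why rebuilding the source type without its encoding made the bail-out comparison permanently false, and why carrying the encoding over restores the fixed point.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy stand-in for RankedTensorType: shape + element type + encoding
// (an empty string models "no encoding").
struct ToyTensorType {
  std::vector<long> shape;
  std::string elementType;
  std::string encoding;
  bool operator==(const ToyTensorType &o) const {
    return shape == o.shape && elementType == o.elementType &&
           encoding == o.encoding;
  }
};

int main() {
  ToyTensorType src{{2, 2}, "f32", "foo"}; // models tensor<2x2xf32, "foo">
  // Before the fix: the rebuilt source type dropped the encoding, so the
  // "nothing to do" check (src == rebuilt) never fired and the pattern kept
  // inserting casts that the cast folder immediately removed.
  ToyTensorType rebuiltOld{src.shape, src.elementType, /*encoding=*/""};
  assert(!(src == rebuiltOld));
  // After the fix: the encoding is preserved, equality holds, and the
  // pattern bails out instead of looping.
  ToyTensorType rebuiltNew{src.shape, src.elementType, src.encoding};
  assert(src == rebuiltNew);
  return 0;
}
```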
--- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 4 ++-- mlir/test/Dialect/Tensor/canonicalize.mlir | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index b21e89a..8298cf1 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -2663,8 +2663,8 @@ struct InsertSliceOpSourceCastInserter final if (!hasValidSizesOffsets(newSrcShape)) return failure(); - RankedTensorType newSrcType = - RankedTensorType::get(newSrcShape, srcType.getElementType()); + RankedTensorType newSrcType = RankedTensorType::get( + newSrcShape, srcType.getElementType(), srcType.getEncoding()); if (srcType == newSrcType || !preservesStaticInformation(srcType, newSrcType) || !tensor::CastOp::areCastCompatible(srcType, newSrcType)) diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 7192a71..90c715b 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -555,6 +555,24 @@ func.func @insert_slice_canonicalize(%arg0 : tensor, %arg1 : index, // ----- +// Do not insert a cast for the following example. The new source type wouldn't be "more static" than the old one. +func.func @insert_slice_canonicalize_encoding(%arg0 : tensor<2x2xf32, "foo">, + %arg1 : tensor<4x4xf32, "foo">) -> tensor<4x4xf32, "foo"> +{ + %0 = tensor.insert_slice %arg0 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xf32, "foo"> into tensor<4x4xf32, "foo"> + return %0 : tensor<4x4xf32, "foo"> +} +// CHECK-LABEL: func @insert_slice_canonicalize_encoding +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x2xf32, "foo"> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<4x4xf32, "foo"> +// CHECK-NOT: tensor.cast +// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[ARG1]] +// CHECK-SAME: [0, 0] [2, 2] [1, 1] +// CHECK-SAME: : tensor<2x2xf32, "foo"> into tensor<4x4xf32, "foo"> +// CHECK: return %[[RESULT]] + +// ----- + func.func @slice_to_insert_slice_canonicalize(%arg0 : tensor, %arg1 : index, %arg2 : index, %arg3 : tensor) -> tensor { -- cgit v1.1 From 6e1f438528b6e1ece2f6ef331905c352ccc2fcfd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Feb 2024 17:35:45 -0800 Subject: [ELF] Improve --ro-rosegment/--omagic/--nmagic tests Notably, test that --ro-rosegment with a linker script may unnecessarily make a read-only PT_LOAD executable. 
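The "unnecessarily executable" hazard falls out of how segment permissions are computed: a PT_LOAD's flags are effectively the union of the flags of the output sections placed in it. The toy model below illustrates only that effect; it is not lld's actual layout code, which also decides where segments break based on flag transitions and the rosegment setting.

```cpp
#include <cstdint>
#include <vector>

// Illustrative permission bits mirroring ELF's PF_* values.
constexpr uint32_t PF_X = 1, PF_W = 2, PF_R = 4;

// Toy model: a segment's permissions are the union of the permissions of
// every output section assigned to it.
uint32_t segmentFlags(const std::vector<uint32_t> &sectionFlags) {
  uint32_t Flags = 0;
  for (uint32_t F : sectionFlags)
    Flags |= F;
  return Flags;
}

int main() {
  // If a linker script places a read-only section into the same segment as
  // code, the read-only bytes inherit execute permission for free.
  uint32_t Mixed = segmentFlags({PF_R, PF_R | PF_X});
  return Mixed == (PF_R | PF_X) ? 0 : 1; // the segment ends up R|X
}
```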
--- lld/test/ELF/segments.s | 200 ++++++++++++++---------------------------------- 1 file changed, 59 insertions(+), 141 deletions(-) diff --git a/lld/test/ELF/segments.s b/lld/test/ELF/segments.s index d9af9a3..614f6e7 100644 --- a/lld/test/ELF/segments.s +++ b/lld/test/ELF/segments.s @@ -1,157 +1,75 @@ # REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o -# RUN: ld.lld %t -o %t1 -# RUN: llvm-readobj --program-headers %t1 | FileCheck --check-prefix=ROSEGMENT %s -# RUN: ld.lld --no-rosegment --rosegment %t -o - | cmp - %t1 -# RUN: ld.lld --omagic --no-omagic %t -o - | cmp - %t1 +# RUN: ld.lld a.o -o a +# RUN: llvm-readelf -l a | FileCheck --check-prefix=ROSEGMENT %s +# RUN: ld.lld --no-rosegment --rosegment a.o -o - | cmp - a +# RUN: ld.lld --omagic --no-omagic a.o -o - | cmp - a -# ROSEGMENT: ProgramHeader { -# ROSEGMENT: Type: PT_LOAD -# ROSEGMENT-NEXT: Offset: 0x0 -# ROSEGMENT-NEXT: VirtualAddress: -# ROSEGMENT-NEXT: PhysicalAddress: -# ROSEGMENT-NEXT: FileSize: -# ROSEGMENT-NEXT: MemSize: -# ROSEGMENT-NEXT: Flags [ -# ROSEGMENT-NEXT: PF_R -# ROSEGMENT-NEXT: ] -# ROSEGMENT-NEXT: Alignment: 4096 -# ROSEGMENT-NEXT: } -# ROSEGMENT-NEXT: ProgramHeader { -# ROSEGMENT-NEXT: Type: PT_LOAD -# ROSEGMENT-NEXT: Offset: 0x15C -# ROSEGMENT-NEXT: VirtualAddress: -# ROSEGMENT-NEXT: PhysicalAddress: -# ROSEGMENT-NEXT: FileSize: -# ROSEGMENT-NEXT: MemSize: -# ROSEGMENT-NEXT: Flags [ -# ROSEGMENT-NEXT: PF_R -# ROSEGMENT-NEXT: PF_X -# ROSEGMENT-NEXT: ] -# ROSEGMENT-NEXT: Alignment: 4096 -# ROSEGMENT-NEXT: } -# ROSEGMENT-NEXT: ProgramHeader { -# ROSEGMENT-NEXT: Type: PT_LOAD -# ROSEGMENT-NEXT: Offset: 0x15E -# ROSEGMENT-NEXT: VirtualAddress: -# ROSEGMENT-NEXT: PhysicalAddress: -# ROSEGMENT-NEXT: FileSize: 1 -# ROSEGMENT-NEXT: MemSize: 1 -# ROSEGMENT-NEXT: Flags [ -# ROSEGMENT-NEXT: PF_R -# ROSEGMENT-NEXT: PF_W -# ROSEGMENT-NEXT: ] -# ROSEGMENT-NEXT: Alignment: 4096 -# ROSEGMENT-NEXT: } +# ROSEGMENT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# ROSEGMENT-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 0x000118 0x000118 R 0x8 +# ROSEGMENT-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x00015a 0x00015a R 0x1000 +# ROSEGMENT-NEXT: LOAD 0x00015c 0x000000000020115c 0x000000000020115c 0x000003 0x000003 R E 0x1000 +# ROSEGMENT-NEXT: LOAD 0x00015f 0x000000000020215f 0x000000000020215f 0x000002 0x000002 RW 0x1000 +# ROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 -# RUN: ld.lld --no-rosegment %t -o %t2 -# RUN: llvm-readobj --program-headers %t2 | FileCheck --check-prefix=NOROSEGMENT %s +# RUN: ld.lld --no-rosegment a.o -o noro +# RUN: llvm-readelf -l noro | FileCheck --check-prefix=NOROSEGMENT %s -# NOROSEGMENT: ProgramHeader { -# NOROSEGMENT: Type: PT_LOAD -# NOROSEGMENT-NEXT: Offset: 0x0 -# NOROSEGMENT-NEXT: VirtualAddress: -# NOROSEGMENT-NEXT: PhysicalAddress: -# NOROSEGMENT-NEXT: FileSize: -# NOROSEGMENT-NEXT: MemSize: -# NOROSEGMENT-NEXT: Flags [ -# NOROSEGMENT-NEXT: PF_R -# NOROSEGMENT-NEXT: PF_X -# NOROSEGMENT-NEXT: ] -# NOROSEGMENT-NEXT: Alignment: 4096 -# NOROSEGMENT-NEXT: } -# NOROSEGMENT-NEXT: ProgramHeader { -# NOROSEGMENT-NEXT: Type: PT_LOAD -# NOROSEGMENT-NEXT: Offset: 0x126 -# NOROSEGMENT-NEXT: VirtualAddress: -# NOROSEGMENT-NEXT: PhysicalAddress: -# NOROSEGMENT-NEXT: FileSize: -# NOROSEGMENT-NEXT: MemSize: -# NOROSEGMENT-NEXT: Flags [ -# NOROSEGMENT-NEXT: PF_R -# NOROSEGMENT-NEXT: PF_W 
-# NOROSEGMENT-NEXT: ] -# NOROSEGMENT-NEXT: Alignment: 4096 -# NOROSEGMENT-NEXT: } -# NOROSEGMENT-NEXT: ProgramHeader { -# NOROSEGMENT-NEXT: Type: PT_GNU_STACK +# NOROSEGMENT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# NOROSEGMENT-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 0x0000e0 0x0000e0 R 0x8 +# NOROSEGMENT-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x000127 0x000127 R E 0x1000 +# NOROSEGMENT-NEXT: LOAD 0x000127 0x0000000000201127 0x0000000000201127 0x000002 0x000002 RW 0x1000 +# NOROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 -# RUN: ld.lld -N %t -o %t3 -# RUN: llvm-readobj --program-headers %t3 | FileCheck --check-prefix=OMAGIC %s -# RUN: ld.lld --omagic %t -o %t3 -# RUN: llvm-readobj --program-headers %t3 | FileCheck --check-prefix=OMAGIC %s +# RUN: ld.lld --no-rosegment a.o -T a.lds -o noro1 +# RUN: llvm-readelf -l noro1 | FileCheck --check-prefix=NOROSEGMENT1 %s -# OMAGIC: ProgramHeader { -# OMAGIC: Type: PT_LOAD -# OMAGIC-NEXT: Offset: 0xB0 -# OMAGIC-NEXT: VirtualAddress: -# OMAGIC-NEXT: PhysicalAddress: -# OMAGIC-NEXT: FileSize: -# OMAGIC-NEXT: MemSize: -# OMAGIC-NEXT: Flags [ -# OMAGIC-NEXT: PF_R -# OMAGIC-NEXT: PF_W -# OMAGIC-NEXT: PF_X -# OMAGIC-NEXT: ] -# OMAGIC-NEXT: Alignment: 4 -# OMAGIC-NEXT: } -# OMAGIC-NEXT: ProgramHeader { -# OMAGIC-NEXT: Type: PT_GNU_STACK +# NOROSEGMENT1: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# NOROSEGMENT1-NEXT: LOAD 0x001000 0x0000000000000000 0x0000000000000000 0x000007 0x000007 R E 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x001007 0x0000000000000007 0x0000000000000007 0x000001 0x000001 RW 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x001008 0x0000000000000008 0x0000000000000008 0x000001 0x000001 R E 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000 +# NOROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 -# RUN: ld.lld -n %t -o %t4 -# RUN: llvm-readobj --program-headers %t4 | FileCheck --check-prefix=NMAGIC %s -# RUN: ld.lld --nmagic %t -o %t4 -# RUN: llvm-readobj --program-headers %t4 | FileCheck --check-prefix=NMAGIC %s +# RUN: ld.lld -N a.o -o omagic +# RUN: llvm-readelf -l omagic | FileCheck --check-prefix=OMAGIC %s +# RUN: ld.lld --omagic a.o -o - | cmp - omagic -# NMAGIC: ProgramHeader { -# NMAGIC-NEXT: Type: PT_LOAD -# NMAGIC-NEXT: Offset: 0x120 -# NMAGIC-NEXT: VirtualAddress: -# NMAGIC-NEXT: PhysicalAddress: -# NMAGIC-NEXT: FileSize: 1 -# NMAGIC-NEXT: MemSize: 1 -# NMAGIC-NEXT: Flags [ -# NMAGIC-NEXT: PF_R -# NMAGIC-NEXT: ] -# NMAGIC-NEXT: Alignment: 1 -# NMAGIC-NEXT: } -# NMAGIC-NEXT: ProgramHeader { -# NMAGIC-NEXT: Type: PT_LOAD -# NMAGIC-NEXT: Offset: 0x124 -# NMAGIC-NEXT: VirtualAddress: -# NMAGIC-NEXT: PhysicalAddress: -# NMAGIC-NEXT: FileSize: 2 -# NMAGIC-NEXT: MemSize: 2 -# NMAGIC-NEXT: Flags [ -# NMAGIC-NEXT: PF_R -# NMAGIC-NEXT: PF_X -# NMAGIC-NEXT: ] -# NMAGIC-NEXT: Alignment: 4 -# NMAGIC-NEXT: } -# NMAGIC-NEXT: ProgramHeader { -# NMAGIC-NEXT: Type: PT_LOAD (0x1) -# NMAGIC-NEXT: Offset: 0x126 -# NMAGIC-NEXT: VirtualAddress: -# NMAGIC-NEXT: PhysicalAddress: -# NMAGIC-NEXT: FileSize: 1 -# NMAGIC-NEXT: MemSize: 1 -# NMAGIC-NEXT: Flags [ -# NMAGIC-NEXT: PF_R -# NMAGIC-NEXT: PF_W -# NMAGIC-NEXT: ] -# NMAGIC-NEXT: Alignment: 1 -# NMAGIC-NEXT: } +# OMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# OMAGIC-NEXT: LOAD 0x0000b0 0x00000000002000b0 0x00000000002000b0 0x000009 0x000009 RWE 0x4 +# OMAGIC-NEXT: GNU_STACK 0x000000 
0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 +# RUN: ld.lld -n a.o -o nmagic +# RUN: llvm-readelf -l nmagic | FileCheck --check-prefix=NMAGIC %s +# RUN: ld.lld --nmagic a.o -o - | cmp nmagic - + +# NMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# NMAGIC-NEXT: LOAD 0x000120 0x0000000000200120 0x0000000000200120 0x000002 0x000002 R 0x1 +# NMAGIC-NEXT: LOAD 0x000124 0x0000000000200124 0x0000000000200124 0x000003 0x000003 R E 0x4 +# NMAGIC-NEXT: LOAD 0x000127 0x0000000000200127 0x0000000000200127 0x000002 0x000002 RW 0x1 +# NMAGIC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 + +#--- a.s .global _start _start: nop -.section .ro,"a" -nop +.section .ro1,"a"; .byte 1 +.section .rw1,"aw"; .byte 3 +.section .rx1,"ax"; .byte 2 -.section .rw,"aw" -nop +.section .ro2,"a"; .byte 1 +.section .rw2,"aw"; .byte 3 +.section .rx2,"ax"; .byte 2 -.section .rx,"ax" -nop +#--- a.lds +SECTIONS { + .ro1 : {} + .text : {} + .rx : { *(.rx*) } + .rw1 : {} + .ro2 : {} + .rw2 : {} +} -- cgit v1.1 From f60826917aff102450a470dee85208fd578685c4 Mon Sep 17 00:00:00 2001 From: Abdurrahman Akkas Date: Thu, 8 Feb 2024 17:50:41 -0800 Subject: [MLIR] Fix a small formatting issue in AsmPrinter.cpp (#81214) Introduced in 76ce4736721a6e9030210bda6df0ad8a8f478a19 --- mlir/lib/IR/AsmPrinter.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 6b8b747..6bed909 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -74,11 +74,9 @@ MLIRContext *AsmParser::getContext() const { return getBuilder().getContext(); } /// Parse a type list. /// This is out-of-line to work-around https://github.com/llvm/llvm-project/issues/62918 ParseResult AsmParser::parseTypeList(SmallVectorImpl &result) { - return parseCommaSeparatedList( - [&]() { return parseType(result.emplace_back()); }); - } - - + return parseCommaSeparatedList( + [&]() { return parseType(result.emplace_back()); }); +} //===----------------------------------------------------------------------===// // DialectAsmPrinter -- cgit v1.1 From ffabcbcf8f9fc7ef5fd29e2a711f95aba0ef7808 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Feb 2024 17:16:31 -0600 Subject: [NVVMReflect][Reland] Force dead branch elimination in NVVMReflect (#81189) Summary: The `__nvvm_reflect` function is used to guard invalid code that varies between architectures. One problem with this feature is that if it is used without optimizations, it will leave invalid code in the module that will then make it to the backend. The `__nvvm_reflect` pass is already mandatory, so it should do some trivial branch removal to ensure that constants are handled correctly. This dead branch elimination only works in the trivial case of a compare on a branch and does not touch any conditionals that were not realted to the `__nvvm_reflect` call in order to preserve `O0` semantics as much as possible. This should allow the following to work on NVPTX targets ```c int foo() { if (__nvvm_reflect("__CUDA_ARCH") >= 700) asm("valid;\n"); } ``` Relanding after fixing a bug. 
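For readers skimming the diff below, the core of the cleanup (and of the reland fix) can be summarized with a toy CFG model. The types here are illustrative stand-ins, not `llvm::BasicBlock`/`llvm::BranchInst`; the point is the guard that restricts the rewrite to simple single-predecessor edges, which is what the relanded version adds.

```cpp
#include <string>
#include <vector>

// Toy CFG block: a conditional branch stores {trueSucc, falseSucc}.
struct Block {
  std::string Name;
  std::vector<Block *> Succs;
  unsigned NumPreds = 0;
  bool Unreachable = false;
};

// Fold "br i1 C, %taken, %dead" into "br %taken" once C is a constant.
void foldConstantBranch(Block &B, bool Cond) {
  Block *Taken = Cond ? B.Succs[0] : B.Succs[1];
  Block *Dead = Cond ? B.Succs[1] : B.Succs[0];
  // The reland's fix: only rewrite when this branch is the dead block's
  // sole predecessor; blocks reachable another way must be left alone.
  if (Dead->NumPreds != 1)
    return;
  B.Succs = {Taken};        // unconditional branch to the live side
  Dead->Unreachable = true; // the untaken side is trivially dead
}
```

Compare with the reverted version above, which performed the rewrite without this single-predecessor guard; the new `phi` test in this patch exercises exactly the multi-predecessor case.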
--- llvm/docs/NVPTXUsage.rst | 5 + llvm/lib/Target/NVPTX/NVVMReflect.cpp | 65 +++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 175 ++++++++++++++++++++++++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll | 1 - 4 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 22acc6c..b5e3918 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -296,6 +296,11 @@ pipeline, immediately after the link stage. The ``internalize`` pass is also recommended to remove unused math functions from the resulting PTX. For an input IR module ``module.bc``, the following compilation flow is recommended: +The ``NVVMReflect`` pass will attempt to remove dead code even without +optimizations. This allows potentially incompatible instructions to be avoided +at all optimizations levels. This currently only works for simple conditionals +like the above example. + 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` 3. Internalize all functions not in list from (1) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 7d2678a..3794ad9b 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -20,6 +20,7 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -36,6 +37,8 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include #include #define NVVM_REFLECT_FUNCTION "__nvvm_reflect" @@ -87,6 +90,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; + SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -171,6 +175,13 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } + + // If the immediate user is a simple comparison we want to simplify it. + // TODO: This currently does not handle switch instructions. + for (User *U : Call->users()) + if (ICmpInst *I = dyn_cast(U)) + ToSimplify.push_back(I); + Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); } @@ -178,6 +189,60 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { for (Instruction *I : ToRemove) I->eraseFromParent(); + // The code guarded by __nvvm_reflect may be invalid for the target machine. + // We need to do some basic dead code elimination to trim invalid code before + // it reaches the backend at all optimization levels. + SmallVector Simplified; + for (ICmpInst *Cmp : ToSimplify) { + Constant *LHS = dyn_cast(Cmp->getOperand(0)); + Constant *RHS = dyn_cast(Cmp->getOperand(1)); + + if (!LHS || !RHS) + continue; + + // If the comparison is a compile time constant we simply propagate it. 
+ Constant *C = ConstantFoldCompareInstOperands( + Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); + + if (!C) + continue; + + for (User *U : Cmp->users()) + if (BranchInst *I = dyn_cast(U)) + Simplified.push_back(I); + + Cmp->replaceAllUsesWith(C); + Cmp->eraseFromParent(); + } + + // Each instruction here is a conditional branch off of a constant true or + // false value. Simply replace it with an unconditional branch to the + // appropriate basic block and delete the rest if it is trivially dead. + DenseSet Removed; + for (BranchInst *Branch : Simplified) { + if (Removed.contains(Branch)) + continue; + + ConstantInt *C = dyn_cast(Branch->getCondition()); + if (!C || (!C->isOne() && !C->isZero())) + continue; + + BasicBlock *TrueBB = + C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); + BasicBlock *FalseBB = + C->isOne() ? Branch->getSuccessor(1) : Branch->getSuccessor(0); + + // This transformation is only correct on simple edges. + if (!FalseBB->hasNPredecessors(1)) + continue; + + ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); + if (FalseBB->use_empty() && !FalseBB->getFirstNonPHIOrDbg()) { + Removed.insert(FalseBB->getFirstNonPHIOrDbg()); + changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); + } + } + return ToRemove.size() > 0; } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll new file mode 100644 index 0000000..9dcdf5b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -0,0 +1,175 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_52 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_70 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefix=SM_90 + +@.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00" +@.str1 = constant [11 x i8] c"__CUDA_FTZ\00" + +declare i32 @__nvvm_reflect(ptr) + +; SM_52: .visible .func (.param .b32 func_retval0) foo() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) foo() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) foo() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @foo() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 900 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %return + +if.else: + %call1 = call i32 @__nvvm_reflect(ptr @.str) + %cmp2 = icmp uge i32 %call1, 700 + br i1 %cmp2, label %if.then3, label %if.else4 + +if.then3: + br label %return + +if.else4: + %call5 = call i32 @__nvvm_reflect(ptr @.str) + %cmp6 = icmp uge i32 %call5, 520 + br i1 %cmp6, label %if.then7, label %if.else8 + +if.then7: + br label %return + +if.else8: + br label %return + +return: + %retval.0 = phi i32 [ 1, %if.then ], [ 2, %if.then3 ], [ 3, %if.then7 ], [ 4, %if.else8 ] + ret i32 %retval.0 +} + +; SM_52: .visible .func (.param .b32 func_retval0) bar() +; SM_52: mov.b32 %[[REG:.+]], 2; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) bar() +; SM_70: mov.b32 %[[REG:.+]], 1; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; 
SM_90: .visible .func (.param .b32 func_retval0) bar() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @bar() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %if.end + +if.else: + br label %if.end + +if.end: + %x = phi i32 [ 1, %if.then ], [ 2, %if.else ] + ret i32 %x +} + +; SM_52-NOT: valid; +; SM_70: valid; +; SM_90: valid; +define void @baz() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %cmp = icmp uge i32 %call, 700 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void asm sideeffect "valid;\0A", ""() + br label %if.end + +if.end: + ret void +} + +; SM_52: .visible .func (.param .b32 func_retval0) qux() +; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_52: ret; +; SM_70: .visible .func (.param .b32 func_retval0) qux() +; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; +; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_70: ret; +; SM_90: .visible .func (.param .b32 func_retval0) qux() +; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; +; SM_90: ret; +define i32 @qux() { +entry: + %call = call i32 @__nvvm_reflect(ptr noundef @.str) + %cmp = icmp uge i32 %call, 700 + %conv = zext i1 %cmp to i32 + switch i32 %conv, label %sw.default [ + i32 900, label %sw.bb + i32 700, label %sw.bb1 + i32 520, label %sw.bb2 + ] + +sw.bb: + br label %return + +sw.bb1: + br label %return + +sw.bb2: + br label %return + +sw.default: + br label %return + +return: + %retval = phi i32 [ 4, %sw.default ], [ 3, %sw.bb2 ], [ 2, %sw.bb1 ], [ 1, %sw.bb ] + ret i32 %retval +} + +; SM_52: .visible .func (.param .b32 func_retval0) phi() +; SM_52: mov.f32 %[[REG:.+]], 0f00000000; +; SM_52-NEXT: st.param.f32 [func_retval0+0], %[[REG]]; +; SM_52-NEXT: ret; +; SM_70: .visible .func (.param .b32 func_retval0) phi() +; SM_70: mov.f32 %[[REG:.+]], 0f00000000; +; SM_70-NEXT: st.param.f32 [func_retval0+0], %[[REG]]; +; SM_70-NEXT: ret; +; SM_90: .visible .func (.param .b32 func_retval0) phi() +; SM_90: mov.f32 %[[REG:.+]], 0f00000000; +; SM_90-NEXT: st.param.f32 [func_retval0+0], %[[REG]]; +; SM_90-NEXT: ret; +define float @phi() { +entry: + %0 = call i32 @__nvvm_reflect(ptr @.str) + %1 = icmp eq i32 %0, 0 + br i1 %1, label %if.then, label %if.else + +if.then: + br label %if.else + +if.else: + %.08 = phi float [ 0.000000e+00, %if.then ], [ 1.000000e+00, %entry ] + %4 = fcmp ogt float %.08, 0.000000e+00 + br i1 %4, label %exit, label %if.exit + +if.exit: + br label %exit + +exit: + ret float 0.000000e+00 +} diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll index e8c554c..ac5875c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -18,4 +18,3 @@ define i32 @foo(float %a, float %b) { ; SM35: ret i32 350 ret i32 %reflect } - -- cgit v1.1 From c560ce464ae486e86e3d2d9684df3f714317f502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Feb 2024 18:49:21 -0800 Subject: [flang][cuda] Lower attribute for dummy argument (#81212) Lower CUDA attribute for simple dummy argument. This is done in a similar way than `TARGET`, `OPTIONAL` and so on. 
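For reference, the smallest case this enables is a dummy argument carrying one
of the CUDA data attributes (adapted from the tests added below; `device` ends
up as a `fir.cuda_attr` argument attribute and as `cuda_attr` on the
`hlfir.declare` of the dummy):

    subroutine dummy_arg_device(dd)
      real, device :: dd
    end subroutine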
This patch also moves the `Fortran::common::CUDADataAttr` to
`fir::CUDAAttributeAttr` mapping to
`flang/include/flang/Optimizer/Support/Utils.h` so that it can be reused where
needed.
---
 .../flang/Optimizer/Dialect/FIROpsSupport.h   |  3 ++
 flang/include/flang/Optimizer/Support/Utils.h | 30 +++++++++++++++++++
 flang/lib/Lower/CallInterface.cpp             |  5 ++++
 flang/lib/Lower/ConvertVariable.cpp           | 28 ++---------------
 flang/test/Lower/CUDA/cuda-data-attribute.cuf | 35 +++++++++++++++++++++-
 5 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
index 977949e..6ac6a31 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
+++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
@@ -72,6 +72,9 @@ constexpr llvm::StringRef getOptionalAttrName() { return "fir.optional"; }
 /// Attribute to mark Fortran entities with the TARGET attribute.
 static constexpr llvm::StringRef getTargetAttrName() { return "fir.target"; }
 
+/// Attribute to mark Fortran entities with the CUDA attribute.
+static constexpr llvm::StringRef getCUDAAttrName() { return "fir.cuda_attr"; }
+
 /// Attribute to mark that a function argument is a character dummy procedure.
 /// Character dummy procedure have special ABI constraints.
 static constexpr llvm::StringRef getCharacterProcedureDummyAttrName() {

diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h
index b50f297..586701b 100644
--- a/flang/include/flang/Optimizer/Support/Utils.h
+++ b/flang/include/flang/Optimizer/Support/Utils.h
@@ -273,6 +273,36 @@ inline void genMinMaxlocReductionLoop(
   builder.setInsertionPointAfter(ifMaskTrueOp);
 }
 
+inline fir::CUDAAttributeAttr
+getCUDAAttribute(mlir::MLIRContext *mlirContext,
+                 std::optional<Fortran::common::CUDADataAttr> cudaAttr) {
+  if (cudaAttr) {
+    fir::CUDAAttribute attr;
+    switch (*cudaAttr) {
+    case Fortran::common::CUDADataAttr::Constant:
+      attr = fir::CUDAAttribute::Constant;
+      break;
+    case Fortran::common::CUDADataAttr::Device:
+      attr = fir::CUDAAttribute::Device;
+      break;
+    case Fortran::common::CUDADataAttr::Managed:
+      attr = fir::CUDAAttribute::Managed;
+      break;
+    case Fortran::common::CUDADataAttr::Pinned:
+      attr = fir::CUDAAttribute::Pinned;
+      break;
+    case Fortran::common::CUDADataAttr::Shared:
+      attr = fir::CUDAAttribute::Shared;
+      break;
+    case Fortran::common::CUDADataAttr::Texture:
+      // Obsolete attribute
+      return {};
+    }
+    return fir::CUDAAttributeAttr::get(mlirContext, attr);
+  }
+  return {};
+}
+
 } // namespace fir
 
 #endif // FORTRAN_OPTIMIZER_SUPPORT_UTILS_H

diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index b007c95..4c297ce 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -19,6 +19,7 @@
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Support/Utils.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/tools.h"
 #include
@@ -993,6 +994,10 @@ private:
       TODO(loc, "VOLATILE in procedure interface");
     if (obj.attrs.test(Attrs::Target))
       addMLIRAttr(fir::getTargetAttrName());
+    if (obj.cudaDataAttr)
+      attrs.emplace_back(
+          mlir::StringAttr::get(&mlirContext, fir::getCUDAAttrName()),
+          fir::getCUDAAttribute(&mlirContext, obj.cudaDataAttr));
 
     // TODO: intents that require special care (e.g finalization)

diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index d57bdd4..f14267f 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -37,6 +37,7 @@
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Support/Utils.h"
 #include "flang/Semantics/runtime-type-info.h"
 #include "flang/Semantics/tools.h"
 #include "llvm/Support/Debug.h"
@@ -1583,32 +1584,7 @@ fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute(
     mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) {
   std::optional<Fortran::common::CUDADataAttr> cudaAttr =
       Fortran::semantics::GetCUDADataAttr(&sym);
-  if (cudaAttr) {
-    fir::CUDAAttribute attr;
-    switch (*cudaAttr) {
-    case Fortran::common::CUDADataAttr::Constant:
-      attr = fir::CUDAAttribute::Constant;
-      break;
-    case Fortran::common::CUDADataAttr::Device:
-      attr = fir::CUDAAttribute::Device;
-      break;
-    case Fortran::common::CUDADataAttr::Managed:
-      attr = fir::CUDAAttribute::Managed;
-      break;
-    case Fortran::common::CUDADataAttr::Pinned:
-      attr = fir::CUDAAttribute::Pinned;
-      break;
-    case Fortran::common::CUDADataAttr::Shared:
-      attr = fir::CUDAAttribute::Shared;
-      break;
-    case Fortran::common::CUDADataAttr::Texture:
-      // Obsolete attribute
-      return {};
-    }
-
-    return fir::CUDAAttributeAttr::get(mlirContext, attr);
-  }
-  return {};
+  return fir::getCUDAAttribute(mlirContext, cudaAttr);
 }
 
 /// Map a symbol to its FIR address and evaluated specification expressions.

diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
index caa8ac7..b02701b 100644
--- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
@@ -1,7 +1,7 @@
 ! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
 ! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s --check-prefix=FIR
 
-! Test lowering of CUDA attribute on local variables.
+! Test lowering of CUDA attribute on variables.
 
 subroutine local_var_attrs
   real, constant :: rc
@@ -20,3 +20,36 @@ end subroutine
 ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref<f32>) -> !fir.ref<f32>
 ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
 ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+
+subroutine dummy_arg_constant(dc)
+  real, constant :: dc
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_constant(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dc", fir.cuda_attr = #fir.cuda<constant>}
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<constant>, uniq_name = "_QFdummy_arg_constantEdc"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine dummy_arg_device(dd)
+  real, device :: dd
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_device(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dd", fir.cuda_attr = #fir.cuda<device>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<device>, uniq_name = "_QFdummy_arg_deviceEdd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine dummy_arg_managed(dm)
+  real, allocatable, managed :: dm
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_managed(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dm", fir.cuda_attr = #fir.cuda<managed>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+subroutine dummy_arg_pinned(dp)
+  real, allocatable, pinned :: dp
+end subroutine
+! CHECK-LABEL: func.func @_QPdummy_arg_pinned(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dp", fir.cuda_attr = #fir.cuda<pinned>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+
+
+
+
-- cgit v1.1

From 8e297c779635d7f22626c1a9dd1cb9dc86ea6540 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Thu, 8 Feb 2024 19:05:55 -0800
Subject: [ELF] Improve --ro-rosegment tests

---
 lld/test/ELF/segments.s | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/lld/test/ELF/segments.s b/lld/test/ELF/segments.s
index 614f6e7..ee17117 100644
--- a/lld/test/ELF/segments.s
+++ b/lld/test/ELF/segments.s
@@ -9,11 +9,24 @@
 # ROSEGMENT:      Type  Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
 # ROSEGMENT-NEXT: PHDR  0x000040 0x0000000000200040 0x0000000000200040 0x000118 0x000118 R   0x8
-# ROSEGMENT-NEXT: LOAD  0x000000 0x0000000000200000 0x0000000000200000 0x00015a 0x00015a R   0x1000
+# ROSEGMENT-NEXT: LOAD  0x000000 0x0000000000200000 0x0000000000200000 0x00015b 0x00015b R   0x1000
 # ROSEGMENT-NEXT: LOAD  0x00015c 0x000000000020115c 0x000000000020115c 0x000003 0x000003 R E 0x1000
 # ROSEGMENT-NEXT: LOAD  0x00015f 0x000000000020215f 0x000000000020215f 0x000002 0x000002 RW  0x1000
 # ROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
 
+# RUN: ld.lld --rosegment a.o -T a.lds -o ro1
+# RUN: llvm-readelf -l ro1 | FileCheck --check-prefix=ROSEGMENT1 %s
+
+# ROSEGMENT1:      Type  Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
+# ROSEGMENT1-NEXT: LOAD  0x001000 0x0000000000000000 0x0000000000000000 0x000001 0x000001 R   0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001004 0x0000000000000004 0x0000000000000004 0x000002 0x000002 R E 0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW  0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001007 0x0000000000000007 0x0000000000000007 0x000001 0x000001 R E 0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001008 0x0000000000000008 0x0000000000000008 0x000001 0x000001 R   0x1000
+# ROSEGMENT1-NEXT: LOAD  0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW  0x1000
+# ROSEGMENT1-NEXT: LOAD  0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R   0x1000
+# ROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
 # RUN: ld.lld --no-rosegment a.o -o noro
 # RUN: llvm-readelf -l noro | FileCheck --check-prefix=NOROSEGMENT %s
@@ -27,10 +40,11 @@
 # RUN: llvm-readelf -l noro1 | FileCheck --check-prefix=NOROSEGMENT1 %s
 
 # NOROSEGMENT1:      Type  Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
-# NOROSEGMENT1-NEXT: LOAD  0x001000 0x0000000000000000 0x0000000000000000 0x000007 0x000007 R E 0x1000
-# NOROSEGMENT1-NEXT: LOAD  0x001007 0x0000000000000007 0x0000000000000007 0x000001 0x000001 RW  0x1000
-# NOROSEGMENT1-NEXT: LOAD  0x001008 0x0000000000000008 0x0000000000000008 0x000001 0x000001 R E 0x1000
+# NOROSEGMENT1-NEXT: LOAD  0x001000 0x0000000000000000 0x0000000000000000 0x000006 0x000006 R E 0x1000
+# NOROSEGMENT1-NEXT: LOAD  0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW  0x1000
+# NOROSEGMENT1-NEXT: LOAD  0x001007
0x0000000000000007 0x000002 0x000002 R E 0x1000 # NOROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R E 0x1000 # NOROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 # RUN: ld.lld -N a.o -o omagic @@ -46,7 +60,7 @@ # RUN: ld.lld --nmagic a.o -o - | cmp nmagic - # NMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# NMAGIC-NEXT: LOAD 0x000120 0x0000000000200120 0x0000000000200120 0x000002 0x000002 R 0x1 +# NMAGIC-NEXT: LOAD 0x000120 0x0000000000200120 0x0000000000200120 0x000003 0x000003 R 0x1 # NMAGIC-NEXT: LOAD 0x000124 0x0000000000200124 0x0000000000200124 0x000003 0x000003 R E 0x4 # NMAGIC-NEXT: LOAD 0x000127 0x0000000000200127 0x0000000000200127 0x000002 0x000002 RW 0x1 # NMAGIC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 @@ -64,12 +78,16 @@ _start: .section .rw2,"aw"; .byte 3 .section .rx2,"ax"; .byte 2 +.section .ro3,"a"; .byte 1 + #--- a.lds SECTIONS { .ro1 : {} .text : {} - .rx : { *(.rx*) } + .rx1 : {} .rw1 : {} + .rx2 : {} .ro2 : {} .rw2 : {} + .ro3 : {} } -- cgit v1.1 From ac0577177f053ba7e7016e1b7e44cf5932d00b03 Mon Sep 17 00:00:00 2001 From: Abinaya Saravanan Date: Fri, 9 Feb 2024 09:15:15 +0530 Subject: [HEXAGON] Add basic block limit for RDF optimizations (#81071) Skip RDF optimizations if a function contains a number of basic blocks that is more than a limit --------- Co-authored-by: Yashas Andaluri --- llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp | 10 ++++++++++ llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp | 11 +++++++++++ llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 4 ++++ 3 files changed, 25 insertions(+) diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index aa31762..0e82bf6 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -47,6 +47,8 @@ static cl::opt CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " "optimization")); +extern cl::opt RDFFuncBlockLimit; + namespace llvm { FunctionPass *createHexagonOptAddrMode(); @@ -856,6 +858,14 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + // Perform RDF optimizations only if number of basic blocks in the + // function is less than the limit + if (MF.size() > RDFFuncBlockLimit) { + LLVM_DEBUG(dbgs() << "Skipping " << getPassName() + << ": too many basic blocks\n"); + return false; + } + bool Changed = false; auto &HST = MF.getSubtarget(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 7eccbd2..4131f2a 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -50,6 +50,9 @@ static unsigned RDFCount = 0; static cl::opt RDFLimit("hexagon-rdf-limit", cl::init(std::numeric_limits::max())); + +extern cl::opt RDFFuncBlockLimit; + static cl::opt RDFDump("hexagon-rdf-dump", cl::Hidden); static cl::opt RDFTrackReserved("hexagon-rdf-track-reserved", cl::Hidden); @@ -285,6 +288,14 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + // Perform RDF optimizations only if number of basic blocks in the + // function is less than the limit + if 
(MF.size() > RDFFuncBlockLimit) { + if (RDFDump) + dbgs() << "Skipping " << getPassName() << ": too many basic blocks\n"; + return false; + } + if (RDFLimit.getPosition()) { if (RDFCount >= RDFLimit) return false; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index e7a692d..7d4b420 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -39,6 +39,10 @@ static cl::opt static cl::opt EnableRDFOpt("rdf-opt", cl::Hidden, cl::init(true), cl::desc("Enable RDF-based optimizations")); +cl::opt RDFFuncBlockLimit( + "rdf-bb-limit", cl::Hidden, cl::init(1000), + cl::desc("Basic block limit for a function for RDF optimizations")); + static cl::opt DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); -- cgit v1.1 From db88f3015867ca569ae78a30f20a944c8e1b8afc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Feb 2024 19:27:27 -0800 Subject: [RISCV] Add test for saving s10 with cm.push. NFC If cm.push saves s10, it must also save s11 due to an encoding limitation. We handle this in the code, but had no test for it. --- llvm/test/CodeGen/RISCV/push-pop-popret.ll | 71 ++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index 09a91498..e007dcc 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -3218,3 +3218,74 @@ entry: call void @bar(ptr %0, ptr %var) ret i32 %x } + +define void @spill_x10() { +; RV32IZCMP-LABEL: spill_x10: +; RV32IZCMP: # %bb.0: # %entry +; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -64 +; RV32IZCMP-NEXT: .cfi_def_cfa_offset 64 +; RV32IZCMP-NEXT: .cfi_offset s10, -8 +; RV32IZCMP-NEXT: #APP +; RV32IZCMP-NEXT: li s10, 0 +; RV32IZCMP-NEXT: #NO_APP +; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 64 +; +; RV64IZCMP-LABEL: spill_x10: +; RV64IZCMP: # %bb.0: # %entry +; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -112 +; RV64IZCMP-NEXT: .cfi_def_cfa_offset 112 +; RV64IZCMP-NEXT: .cfi_offset s10, -16 +; RV64IZCMP-NEXT: #APP +; RV64IZCMP-NEXT: li s10, 0 +; RV64IZCMP-NEXT: #NO_APP +; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 112 +; +; RV32IZCMP-SR-LABEL: spill_x10: +; RV32IZCMP-SR: # %bb.0: # %entry +; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -64 +; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 64 +; RV32IZCMP-SR-NEXT: .cfi_offset s10, -8 +; RV32IZCMP-SR-NEXT: #APP +; RV32IZCMP-SR-NEXT: li s10, 0 +; RV32IZCMP-SR-NEXT: #NO_APP +; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 64 +; +; RV64IZCMP-SR-LABEL: spill_x10: +; RV64IZCMP-SR: # %bb.0: # %entry +; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -112 +; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 112 +; RV64IZCMP-SR-NEXT: .cfi_offset s10, -16 +; RV64IZCMP-SR-NEXT: #APP +; RV64IZCMP-SR-NEXT: li s10, 0 +; RV64IZCMP-SR-NEXT: #NO_APP +; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 112 +; +; RV32I-LABEL: spill_x10: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw s10, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset s10, -4 +; RV32I-NEXT: #APP +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: lw s10, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: spill_x10: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd s10, 8(sp) # 8-byte 
Folded Spill +; RV64I-NEXT: .cfi_offset s10, -8 +; RV64I-NEXT: #APP +; RV64I-NEXT: li s10, 0 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +entry: + tail call void asm sideeffect "li s10, 0", "~{s10}"() + ret void +} -- cgit v1.1 From 763139afc19ddf2e0f0265dc828ce8e5fbe92530 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 8 Feb 2024 21:42:29 -0800 Subject: [clang-format] Update FormatToken::isSimpleTypeSpecifier() (#80241) Now with a8279a8bc541, we can make the update. --- clang/include/clang/Format/Format.h | 2 ++ clang/lib/Format/FormatToken.cpp | 35 +---------------------------------- clang/lib/Format/FormatTokenLexer.cpp | 7 ++++--- clang/lib/Format/FormatTokenLexer.h | 1 - 4 files changed, 7 insertions(+), 38 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index cb14d98..bb63d33 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -5175,6 +5175,8 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, ArrayRef Ranges, StringRef FileName = ""); +extern LangOptions LangOpts; + /// Returns the ``LangOpts`` that the formatter expects you to set. /// /// \param Style determines specific settings for lexing mode. diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index b791c5a..69f751d 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -34,41 +34,8 @@ const char *getTokenTypeName(TokenType Type) { return nullptr; } -// FIXME: This is copy&pasted from Sema. Put it in a common place and remove -// duplication. bool FormatToken::isSimpleTypeSpecifier() const { - switch (Tok.getKind()) { - case tok::kw_short: - case tok::kw_long: - case tok::kw___int64: - case tok::kw___int128: - case tok::kw_signed: - case tok::kw_unsigned: - case tok::kw_void: - case tok::kw_char: - case tok::kw_int: - case tok::kw_half: - case tok::kw_float: - case tok::kw_double: - case tok::kw___bf16: - case tok::kw__Float16: - case tok::kw___float128: - case tok::kw___ibm128: - case tok::kw_wchar_t: - case tok::kw_bool: -#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait: -#include "clang/Basic/TransformTypeTraits.def" - case tok::annot_typename: - case tok::kw_char8_t: - case tok::kw_char16_t: - case tok::kw_char32_t: - case tok::kw_typeof: - case tok::kw_decltype: - case tok::kw__Atomic: - return true; - default: - return false; - } + return Tok.isSimpleTypeSpecifier(LangOpts); } bool FormatToken::isTypeOrIdentifier() const { diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index a87d0ba..31b2b7e 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -22,18 +22,20 @@ namespace clang { namespace format { +LangOptions LangOpts; + FormatTokenLexer::FormatTokenLexer( const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator &Allocator, IdentifierTable &IdentTable) : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), - Column(Column), TrailingWhitespace(0), - LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), + Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), Style(Style), IdentTable(IdentTable), Keywords(IdentTable), Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), FormattingDisabled(false), 
MacroBlockBeginRegex(Style.MacroBlockBegin),
       MacroBlockEndRegex(Style.MacroBlockEnd) {
+  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
   Lex->SetKeepWhitespaceMode(true);
@@ -1442,7 +1444,6 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) {
 
 void FormatTokenLexer::resetLexer(unsigned Offset) {
   StringRef Buffer = SourceMgr.getBufferData(ID);
-  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
                       Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
   Lex->SetKeepWhitespaceMode(true);

diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 65dd733..52838f1 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -120,7 +120,6 @@ private:
   unsigned Column;
   unsigned TrailingWhitespace;
   std::unique_ptr<Lexer> Lex;
-  LangOptions LangOpts;
   const SourceManager &SourceMgr;
   FileID ID;
   const FormatStyle &Style;
-- cgit v1.1

From b2cd50dbe78c0f0328fe208ab8c4d6005d9272dd Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 8 Feb 2024 22:42:06 -0800
Subject: [RISCV] Replace XLenVT with i64 in some isel patterns that are only
 used for RV64. NFC

---
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index f8938c0..9e32444 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -888,22 +888,22 @@ foreach i = {1,2,3} in {
 }
 
 let Predicates = [HasStdExtZbs, IsRV64] in {
-def : Pat<(i32 (and (not (shiftop<shl> 1, (XLenVT GPR:$rs2))), GPR:$rs1)),
+def : Pat<(i32 (and (not (shiftop<shl> 1, (i64 GPR:$rs2))), GPR:$rs1)),
           (BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (and (rotl -2, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (and (rotl -2, (i64 GPR:$rs2)), GPR:$rs1)),
           (BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (or (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (or (shiftop<shl> 1, (i64 GPR:$rs2)), GPR:$rs1)),
           (BSET GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (xor (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (xor (shiftop<shl> 1, (i64 GPR:$rs2)), GPR:$rs1)),
           (BINV GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (and (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)), 1)),
+def : Pat<(i32 (and (shiftop<srl> GPR:$rs1, (i64 GPR:$rs2)), 1)),
           (BEXT GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i64 (and (anyext (i32 (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)))), 1)),
+def : Pat<(i64 (and (anyext (i32 (shiftop<srl> GPR:$rs1, (i64 GPR:$rs2)))), 1)),
           (BEXT GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (shiftop<shl> 1, (XLenVT GPR:$rs2))),
+def : Pat<(i32 (shiftop<shl> 1, (i64 GPR:$rs2))),
           (BSET (XLenVT X0), GPR:$rs2)>;
-def : Pat<(i32 (not (shiftop<shl> -1, (XLenVT GPR:$rs2)))),
+def : Pat<(i32 (not (shiftop<shl> -1, (i64 GPR:$rs2)))),
           (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>;
 
 def : Pat<(i32 (and (srl GPR:$rs1, uimm5:$shamt), (i32 1))),
-- cgit v1.1

From 95b14da678f4670283240ef4cf60f3a39bed97b4 Mon Sep 17 00:00:00 2001
From: Quentin Dian
Date: Fri, 9 Feb 2024 15:29:05 +0800
Subject: [RegisterCoalescer] Clear instructions not recorded in `ErasedInstrs`
 but erased (#79820)

Fixes #79718. Fixes #71178.

The same instruction may appear more than once in an iteration's work list,
so we cannot simply delete instructions recorded in `ErasedInstrs` right
away.
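In sketch form, the fix tracks the instructions erased during the current
round in a separate set and consults it alongside `ErasedInstrs` (abbreviated
from the diff below; the bookkeeping around `Again`/`Progress` is simplified):

    // Copies erased during this round. joinCopy() may remove a copy from
    // ErasedInstrs after erasing it, so a second occurrence of the same
    // MachineInstr* in the work list must be caught here.
    SmallPtrSet<MachineInstr *, 4> CurrentErasedInstrs;
    for (MachineInstr *&MI : CurrList) {
      if (!MI)
        continue;
      if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) {
        MI = nullptr; // Stale pointer; the instruction is already gone.
        continue;
      }
      bool Again = false;
      Progress |= joinCopy(MI, Again, CurrentErasedInstrs);
    }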
--- llvm/lib/CodeGen/RegisterCoalescer.cpp             |  27 ++-
 .../LoongArch/register-coalescer-crash-pr79718.mir | 213 +++++++++++++++++++++
 .../X86/PR71178-register-coalescer-crash.ll        | 103 ++++++++++
 3 files changed, 338 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir
 create mode 100644 llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll

diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index cbb1a74..7e9c992 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -236,7 +236,8 @@ namespace {
     /// was successfully coalesced away. If it is not currently possible to
     /// coalesce this interval, but it may be possible if other things get
     /// coalesced, then it returns true by reference in 'Again'.
-    bool joinCopy(MachineInstr *CopyMI, bool &Again);
+    bool joinCopy(MachineInstr *CopyMI, bool &Again,
+                  SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs);
 
     /// Attempt to join these two intervals.  On failure, this
     /// returns false.  The output "SrcInt" will not have been modified, so we
@@ -1964,7 +1965,9 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
   LIS->shrinkToUses(&LI);
 }
 
-bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
+bool RegisterCoalescer::joinCopy(
+    MachineInstr *CopyMI, bool &Again,
+    SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) {
   Again = false;
   LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
@@ -2156,7 +2159,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   // CopyMI has been erased by joinIntervals at this point. Remove it from
   // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
   // to the work list. This keeps ErasedInstrs from growing needlessly.
-  ErasedInstrs.erase(CopyMI);
+  if (ErasedInstrs.erase(CopyMI))
+    // But we may encounter the instruction again in this iteration.
+    CurrentErasedInstrs.insert(CopyMI);
 
   // Rewrite all SrcReg operands to DstReg.
   // Also update DstReg operands to include DstIdx if it is set.
@@ -3982,21 +3987,33 @@ void RegisterCoalescer::lateLiveIntervalUpdate() {
 bool RegisterCoalescer::
 copyCoalesceWorkList(MutableArrayRef<MachineInstr *> CurrList) {
   bool Progress = false;
+  SmallPtrSet<MachineInstr *, 4> CurrentErasedInstrs;
   for (MachineInstr *&MI : CurrList) {
     if (!MI)
       continue;
     // Skip instruction pointers that have already been erased, for example by
    // dead code elimination.
-    if (ErasedInstrs.count(MI)) {
+    if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) {
       MI = nullptr;
       continue;
     }
     bool Again = false;
-    bool Success = joinCopy(MI, Again);
+    bool Success = joinCopy(MI, Again, CurrentErasedInstrs);
     Progress |= Success;
     if (Success || !Again)
       MI = nullptr;
   }
+  // Clear instructions not recorded in `ErasedInstrs` but erased.
+ if (!CurrentErasedInstrs.empty()) { + for (MachineInstr *&MI : CurrList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + for (MachineInstr *&MI : WorkList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + } return Progress; } diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir new file mode 100644 index 0000000..9bbb579 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir @@ -0,0 +1,213 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -o - %s -mtriple=loongarch64 \ +# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s + +--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 + ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 + ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 + ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 + ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 + ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 + ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 + ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: PseudoBR %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 + ; CHECK-NEXT: PseudoBR %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 + ; CHECK-NEXT: PseudoBR %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: 
[[COPY6:%[0-9]+]]:gpr = COPY [[ORI]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + bb.0: + liveins: $r4, $r5, $r6, $r7, $r8 + + %0:gpr = COPY killed $r8 + %1:gpr = COPY killed $r7 + %2:gpr = COPY killed $r6 + %3:gpr = COPY killed $r5 + %4:gpr = COPY killed $r4 + %5:gpr = COPY $r0 + %6:gpr = COPY killed %5 + %7:gpr = ANDI killed %3, 1 + %8:gpr = ORI $r0, 1 + %9:gpr = ANDI killed %2, 1 + %10:gpr = ANDI killed %1, 1 + %11:gpr = ANDI killed %0, 1 + %12:gpr = COPY %6 + %13:gpr = COPY killed %6 + %14:gpr = IMPLICIT_DEF + + bb.1: + %15:gpr = COPY killed %14 + %16:gpr = COPY killed %13 + %17:gpr = COPY killed %12 + %18:gpr = COPY %17 + %19:gpr = COPY %16 + %20:gpr = COPY killed %16 + %21:gpr = COPY killed %15 + + bb.2: + successors: %bb.3, %bb.4 + + %22:gpr = COPY killed %21 + %23:gpr = COPY killed %20 + %24:gpr = COPY killed %19 + %25:gpr = COPY killed %18 + BEQZ %7, %bb.4 + + bb.3: + %26:gpr = COPY killed %24 + %27:gpr = COPY killed %23 + PseudoBR %bb.9 + + bb.4: + %28:gpr = COPY killed %23 + + bb.5: + successors: %bb.7(0x7c000000), %bb.6(0x04000000) + + %29:gpr = COPY killed %28 + dead %30:gpr = LD_D $r0, 8 + dead %31:gpr = LD_D $r0, 0 + BNEZ %9, %bb.7 + + bb.6: + %32:gpr = COPY $r0 + %33:gpr = COPY killed %32 + %34:gpr = COPY killed %33 + %35:gpr = COPY killed %22 + PseudoBR %bb.11 + + bb.7: + successors: %bb.8(0x7c000000), %bb.10(0x04000000) + + BEQZ %10, %bb.10 + PseudoBR %bb.8 + + bb.8: + successors: %bb.9(0x04000000), %bb.5(0x7c000000) + + %36:gpr = ADDI_D killed %29, 1 + %28:gpr = COPY %36 + %26:gpr = COPY %36 + %27:gpr = COPY killed %36 + BEQZ %11, %bb.5 + PseudoBR %bb.9 + + bb.9: + %37:gpr = COPY killed %27 + %38:gpr = COPY killed %26 + %39:gpr = COPY $r0 + ST_B killed %39, %4, 0 + %40:gpr = COPY killed %25 + %41:gpr = COPY killed %38 + %42:gpr = COPY killed %37 + %43:gpr = COPY killed %22 + PseudoBR %bb.12 + + bb.10: + %44:gpr = ADDI_D killed %29, 1 + %34:gpr = COPY %8 + %35:gpr = COPY killed %44 + + bb.11: + %45:gpr = COPY killed %35 + %46:gpr = COPY killed %34 + %47:gpr = COPY $r0 + ST_D killed %47, %4, 0 + %40:gpr = COPY %45 + %41:gpr = COPY %46 + %42:gpr = COPY killed %46 + %43:gpr = COPY killed %45 + + bb.12: + successors: %bb.2(0x7c000000), %bb.1(0x04000000) + + %48:gpr = COPY killed %43 + %49:gpr = COPY killed %42 + %50:gpr = COPY killed %41 + %51:gpr = COPY killed %40 + %12:gpr = COPY %51 + %13:gpr = COPY %50 + %14:gpr = COPY %48 + %18:gpr = COPY killed %51 + %19:gpr = COPY killed %50 + %20:gpr = COPY killed %49 + %21:gpr = COPY killed %48 + BEQ %17, %8, %bb.2 + PseudoBR %bb.1 + +... 
diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll new file mode 100644 index 0000000..0ce346f --- /dev/null +++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s + +define i32 @h(i1 %arg, i32 %arg1) { +; CHECK-LABEL: h: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_9: # %bb18 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: .LBB0_1: # %bb4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.7: # %bb16 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_9 +; CHECK-NEXT: # %bb.8: # %bb17 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: jmp .LBB0_9 +; CHECK-NEXT: .LBB0_2: # %bb9 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: # %bb13 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .LBB0_4: # %bb14 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmpl $1, %esi +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.5: # %bb14 +; CHECK-NEXT: movl %eax, %r8d +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: jne .LBB0_6 +; CHECK-NEXT: .LBB0_10: # %bb22 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: retq +bb: + br label %bb2 + +bb2: ; preds = %bb14, %bb + %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] + %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] + br label %bb4 + +bb4: ; preds = %bb18, %bb2 + %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] + %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] + %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] + %i8 = icmp eq i64 %i6, 0 + br i1 %i8, label %bb16, label %bb9 + +bb9: ; preds = %bb4 + br i1 %arg, label %bb12, label %bb10 + +bb10: ; preds = %bb9 + %i11 = sdiv i64 0, 0 + br label %bb12 + +bb12: ; preds = %bb10, %bb9 + br i1 %arg, label %bb13, label %bb14 + +bb13: ; preds = %bb12 + br label %bb14 + +bb14: ; preds = %bb13, %bb12 + %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] + switch i32 %arg1, label %bb22 [ + i32 0, label %bb21 + i32 1, label %bb2 + ] + +bb16: ; preds = %bb4 + br i1 %arg, label %bb18, label %bb17 + +bb17: ; preds = %bb16 + br label %bb18 + +bb18: ; preds = %bb17, %bb16 + %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] + %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] + br i1 %arg, label %bb22, label %bb4 + +bb21: ; preds = %bb14 + br label %bb22 + +bb22: ; preds = %bb21, %bb18, %bb14 + %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] + ret i32 %i23 +} -- cgit v1.1 From abc39f9aa750634973fe8ba5519d6bbdd70567c4 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Feb 2024 23:36:37 -0800 Subject: [RISCV] Add casts to isel patterns that produce more than 1 instruction. We need explicitly cast to XLenVT to avoid tablegen picking i32. 
If the SelectionDAG scheduler is used it can't find a register class for i32 if i32 isn't a legal type. Fixes #81192, but I might have missed some patterns. --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 24 +++---- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 24 +++---- llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 16 ++--- llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 59 ++++++++-------- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 99 ++++++++++++++------------- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 16 ++--- 7 files changed, 122 insertions(+), 118 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 5189824..7fe9b62 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1260,14 +1260,14 @@ def : PatGprSimm12; // negate of low bit can be done via two (compressible) shifts. The negate // is never compressible since rs1 and rd can't be the same register. def : Pat<(XLenVT (sub 0, (and_oneuse GPR:$rs, 1))), - (SRAI (SLLI $rs, (ImmSubFromXLen (XLenVT 1))), + (SRAI (XLenVT (SLLI $rs, (ImmSubFromXLen (XLenVT 1)))), (ImmSubFromXLen (XLenVT 1)))>; // AND with leading/trailing ones mask exceeding simm32/simm12. def : Pat<(i64 (and GPR:$rs, LeadingOnesMask:$mask)), - (SLLI (SRLI $rs, LeadingOnesMask:$mask), LeadingOnesMask:$mask)>; + (SLLI (i64 (SRLI $rs, LeadingOnesMask:$mask)), LeadingOnesMask:$mask)>; def : Pat<(XLenVT (and GPR:$rs, TrailingOnesMask:$mask)), - (SRLI (SLLI $rs, TrailingOnesMask:$mask), TrailingOnesMask:$mask)>; + (SRLI (XLenVT (SLLI $rs, TrailingOnesMask:$mask)), TrailingOnesMask:$mask)>; // Match both a plain shift and one where the shift amount is masked (this is // typically introduced when the legalizer promotes the shift amount and @@ -1380,7 +1380,7 @@ defm Select_GPR : SelectCC_GPR_rrirr; class SelectCompressOpt : Pat<(riscv_selectcc_frag:$select (XLenVT GPR:$lhs), simm12_no6:$Constant, Cond, (XLenVT GPR:$truev), GPR:$falsev), - (Select_GPR_Using_CC_GPR (ADDI GPR:$lhs, (NegImm simm12:$Constant)), (XLenVT X0), + (Select_GPR_Using_CC_GPR (XLenVT (ADDI GPR:$lhs, (NegImm simm12:$Constant))), (XLenVT X0), (IntCCtoRISCVCC $select), GPR:$truev, GPR:$falsev)>; def OptForMinSize : Predicate<"MF ? MF->getFunction().hasMinSize() : false">; @@ -1728,12 +1728,12 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), /// RV64 patterns let Predicates = [IsRV64, NotHasStdExtZba] in { -def : Pat<(i64 (and GPR:$rs1, 0xffffffff)), (SRLI (SLLI GPR:$rs1, 32), 32)>; +def : Pat<(i64 (and GPR:$rs1, 0xffffffff)), (SRLI (i64 (SLLI GPR:$rs1, 32)), 32)>; // If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2 // shifts instead of 3. This can occur when unsigned is used to index an array. def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)), - (SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>; + (SRLI (i64 (SLLI GPR:$rs1, 32)), (ImmSubFrom32 uimm5:$shamt))>; } class binop_allhusers @@ -1768,7 +1768,7 @@ def u32simm12 : ImmLeaf; + (SLLI (i64 (SRLIW $rs, LeadingOnesWMask:$mask)), LeadingOnesWMask:$mask)>; /// sext and zext @@ -1864,13 +1864,13 @@ def KCFI_CHECK /// Simple optimization def : Pat<(XLenVT (add GPR:$rs1, (AddiPair:$rs2))), - (ADDI (ADDI GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)), + (ADDI (XLenVT (ADDI GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2))), (AddiPairImmSmall GPR:$rs2))>; let Predicates = [IsRV64] in { // Select W instructions if only the lower 32-bits of the result are used. 
def : Pat<(binop_allwusers GPR:$rs1, (AddiPair:$rs2)), - (ADDIW (ADDIW GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)), + (ADDIW (i64 (ADDIW GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2))), (AddiPairImmSmall AddiPair:$rs2))>; } @@ -1929,7 +1929,7 @@ def : PatGprImm; def : PatGprImm; def : Pat<(i32 (and GPR:$rs, TrailingOnesMask:$mask)), - (SRLI (SLLI $rs, (i64 (XLenSubTrailingOnes $mask))), + (SRLI (i32 (SLLI $rs, (i64 (XLenSubTrailingOnes $mask)))), (i64 (XLenSubTrailingOnes $mask)))>; // Use sext if the sign bit of the input is 0. @@ -1937,12 +1937,12 @@ def : Pat<(zext_is_sext GPR:$src), (ADDIW GPR:$src, 0)>; } let Predicates = [IsRV64, NotHasStdExtZba] in { -def : Pat<(zext GPR:$src), (SRLI (SLLI GPR:$src, 32), 32)>; +def : Pat<(zext GPR:$src), (SRLI (i64 (SLLI GPR:$src, 32)), 32)>; // If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2 // shifts instead of 3. This can occur when unsigned is used to index an array. def : Pat<(shl (zext GPR:$rs), uimm5:$shamt), - (SRLI (SLLI GPR:$rs, 32), (ImmSubFrom32 uimm5:$shamt))>; + (SRLI (i64 (SLLI GPR:$rs, 32)), (ImmSubFrom32 uimm5:$shamt))>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index fec43d8..9b4f93d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -410,11 +410,11 @@ foreach Ext = DExts in { let Predicates = [HasStdExtD] in { // Match signaling FEQ_D def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETEQ)), - (AND (FLE_D $rs1, $rs2), - (FLE_D $rs2, $rs1))>; + (AND (XLenVT (FLE_D $rs1, $rs2)), + (XLenVT (FLE_D $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETOEQ)), - (AND (FLE_D $rs1, $rs2), - (FLE_D $rs2, $rs1))>; + (AND (XLenVT (FLE_D $rs1, $rs2)), + (XLenVT (FLE_D $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs1, SETEQ)), (FLE_D $rs1, $rs1)>; @@ -430,11 +430,11 @@ def : PatSetCC; let Predicates = [HasStdExtZdinx, IsRV64] in { // Match signaling FEQ_D def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETEQ)), - (AND (FLE_D_INX $rs1, $rs2), - (FLE_D_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_D_INX $rs1, $rs2)), + (XLenVT (FLE_D_INX $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETOEQ)), - (AND (FLE_D_INX $rs1, $rs2), - (FLE_D_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_D_INX $rs1, $rs2)), + (XLenVT (FLE_D_INX $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs1, SETEQ)), (FLE_D_INX $rs1, $rs1)>; @@ -450,11 +450,11 @@ def : PatSetCC; let Predicates = [HasStdExtZdinx, IsRV32] in { // Match signaling FEQ_D def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETEQ)), - (AND (FLE_D_IN32X $rs1, $rs2), - (FLE_D_IN32X $rs2, $rs1))>; + (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)), + (XLenVT (FLE_D_IN32X $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETOEQ)), - (AND (FLE_D_IN32X $rs1, $rs2), - (FLE_D_IN32X $rs2, $rs1))>; + (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)), + (XLenVT (FLE_D_IN32X $rs2, $rs1)))>; // If both operands are the same, use a single FLE. 
def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs1, SETEQ)), (FLE_D_IN32X $rs1, $rs1)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 52eadbd..7d89608 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -617,11 +617,11 @@ foreach Ext = FExts in { let Predicates = [HasStdExtF] in { // Match signaling FEQ_S def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETEQ)), - (AND (FLE_S $rs1, $rs2), - (FLE_S $rs2, $rs1))>; + (AND (XLenVT (FLE_S $rs1, $rs2)), + (XLenVT (FLE_S $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETOEQ)), - (AND (FLE_S $rs1, $rs2), - (FLE_S $rs2, $rs1))>; + (AND (XLenVT (FLE_S $rs1, $rs2)), + (XLenVT (FLE_S $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETEQ)), (FLE_S $rs1, $rs1)>; @@ -632,11 +632,11 @@ def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETOEQ)), let Predicates = [HasStdExtZfinx] in { // Match signaling FEQ_S def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs2, SETEQ)), - (AND (FLE_S_INX $rs1, $rs2), - (FLE_S_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_S_INX $rs1, $rs2)), + (XLenVT (FLE_S_INX $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs2, SETOEQ)), - (AND (FLE_S_INX $rs1, $rs2), - (FLE_S_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_S_INX $rs1, $rs2)), + (XLenVT (FLE_S_INX $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs1, SETEQ)), (FLE_S_INX $rs1, $rs1)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index f9890ca..6b43d43 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -112,7 +112,7 @@ let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in { // inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish // zeroing the upper 32 bits. 
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), - (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; + (MULHU (i64 (SLLI GPR:$rs1, 32)), (i64 (SLLI GPR:$rs2, 32)))>; } // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index ff474e4..79ced38 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -548,65 +548,66 @@ def : Pat<(add_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)), (TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 1)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 1)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 1)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 1)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 1)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 1)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 2)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 2)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 2)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 3)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 3)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 3)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 3)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2), - (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 3)>; + (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 3)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i), - (TH_ADDSL GPR:$r, (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i)), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), 2)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i), - (TH_ADDSL GPR:$r, (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i)), 3)>; + (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), 3)>; def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i), - (SLLI (TH_ADDSL GPR:$r, GPR:$r, 1), + (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 1)), (TrailingZeros C3LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i), - (SLLI (TH_ADDSL GPR:$r, GPR:$r, 2), + (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), (TrailingZeros C5LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i), - (SLLI (TH_ADDSL GPR:$r, GPR:$r, 3), + (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (TrailingZeros C9LeftShift:$i))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 11)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 1)>; + (TH_ADDSL GPR:$r, (XLenVT 
(TH_ADDSL GPR:$r, GPR:$r, 2)), 1)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 19)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 1)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 1)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 13)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 1), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 1)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 21)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 37)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 25)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 2), (TH_ADDSL GPR:$r, GPR:$r, 2), 2)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), + (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 41)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 3)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 3)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 73)), - (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 3)>; + (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 3)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 27)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 1)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 1)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 45)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 2)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 2)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)), - (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 3)>; + (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 3)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 200)), - (SLLI (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 2), - (TH_ADDSL GPR:$r, GPR:$r, 2), 2), 3)>; + (SLLI (XLenVT (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), + (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)), 3)>; } // Predicates = [HasVendorXTHeadBa] let Predicates = [HasVendorXTHeadBb] in { @@ -633,14 +634,14 @@ def : Pat<(sra (bswap i64:$rs1), (i64 32)), def : Pat<(binop_allwusers (bswap i64:$rs1), (i64 32)), (TH_REVW i64:$rs1)>; def : Pat<(riscv_clzw i64:$rs1), - (TH_FF0 (SLLI (XORI i64:$rs1, -1), 32))>; + (TH_FF0 (i64 (SLLI (i64 (XORI i64:$rs1, -1)), 32)))>; } // Predicates = [HasVendorXTHeadBb, IsRV64] let Predicates = [HasVendorXTHeadBs] in { def : Pat<(and (srl (XLenVT GPR:$rs1), uimmlog2xlen:$shamt), 1), (TH_TST GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(XLenVT (seteq (and (XLenVT GPR:$rs1), SingleBitSetMask:$mask), 0)), - (TH_TST (XORI GPR:$rs1, -1), SingleBitSetMask:$mask)>; + (TH_TST (XLenVT (XORI GPR:$rs1, -1)), SingleBitSetMask:$mask)>; } // Predicates = [HasVendorXTHeadBs] let Predicates = [HasVendorXTHeadCondMov] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 9e32444..f0f8494 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -555,7 +555,7 @@ def : Pat<(XLenVT (and (shiftop GPR:$rs1, (XLenVT GPR:$rs2)), 1)), def : Pat<(XLenVT (shiftop 1, (XLenVT GPR:$rs2))), (BSET (XLenVT X0), GPR:$rs2)>; def : Pat<(XLenVT (not (shiftop -1, (XLenVT GPR:$rs2)))), - (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>; + (ADDI (XLenVT (BSET 
(XLenVT X0), GPR:$rs2)), -1)>; def : Pat<(XLenVT (and GPR:$rs1, BCLRMask:$mask)), (BCLRI GPR:$rs1, BCLRMask:$mask)>; @@ -568,25 +568,25 @@ def : Pat<(XLenVT (and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1))), (BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(XLenVT (seteq (XLenVT (and GPR:$rs1, SingleBitSetMask:$mask)), 0)), - (BEXTI (XORI GPR:$rs1, -1), SingleBitSetMask:$mask)>; + (BEXTI (XLenVT (XORI GPR:$rs1, -1)), SingleBitSetMask:$mask)>; def : Pat<(XLenVT (or GPR:$r, BSETINVTwoBitsMask:$i)), - (BSETI (BSETI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i)), + (BSETI (XLenVT (BSETI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>; def : Pat<(XLenVT (xor GPR:$r, BSETINVTwoBitsMask:$i)), - (BINVI (BINVI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i)), + (BINVI (XLenVT (BINVI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>; def : Pat<(XLenVT (or GPR:$r, BSETINVORIMask:$i)), - (BSETI (ORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)), + (BSETI (XLenVT (ORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>; def : Pat<(XLenVT (xor GPR:$r, BSETINVORIMask:$i)), - (BINVI (XORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)), + (BINVI (XLenVT (XORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i))), (BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>; def : Pat<(XLenVT (and GPR:$r, BCLRITwoBitsMask:$i)), - (BCLRI (BCLRI GPR:$r, (BCLRITwoBitsMaskLow BCLRITwoBitsMask:$i)), + (BCLRI (XLenVT (BCLRI GPR:$r, (BCLRITwoBitsMaskLow BCLRITwoBitsMask:$i))), (BCLRITwoBitsMaskHigh BCLRITwoBitsMask:$i))>; def : Pat<(XLenVT (and GPR:$r, BCLRIANDIMask:$i)), - (BCLRI (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i)), + (BCLRI (XLenVT (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i))), (BCLRITwoBitsMaskHigh BCLRIANDIMask:$i))>; } // Predicates = [HasStdExtZbs] @@ -614,7 +614,7 @@ def : PatGpr; def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; def : Pat<(i64 (riscv_absw GPR:$rs1)), - (MAX GPR:$rs1, (SUBW (XLenVT X0), GPR:$rs1))>; + (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>; } // Predicates = [HasStdExtZbb, IsRV64] let Predicates = [HasStdExtZbb] in { @@ -686,63 +686,66 @@ foreach i = {1,2,3} in { } def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2), - (SH1ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH1ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2), - (SH1ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH1ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2), - (SH1ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH1ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2), - (SH2ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH2ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2), - (SH2ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH2ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2), - (SH2ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH2ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2), - (SH3ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH3ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), 
GPR:$rs2), - (SH3ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH3ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2), - (SH3ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; + (SH3ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i), - (SH2ADD (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i)), + (SH2ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), GPR:$r)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i), - (SH3ADD (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i)), + (SH3ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), GPR:$r)>; def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i), - (SLLI (SH1ADD GPR:$r, GPR:$r), + (SLLI (XLenVT (SH1ADD GPR:$r, GPR:$r)), (TrailingZeros C3LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i), - (SLLI (SH2ADD GPR:$r, GPR:$r), + (SLLI (XLenVT (SH2ADD GPR:$r, GPR:$r)), (TrailingZeros C5LeftShift:$i))>; def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i), - (SLLI (SH3ADD GPR:$r, GPR:$r), + (SLLI (XLenVT (SH3ADD GPR:$r, GPR:$r)), (TrailingZeros C9LeftShift:$i))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 11)), - (SH1ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH1ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 19)), - (SH1ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH1ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 13)), - (SH2ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH2ADD (XLenVT (SH1ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 21)), - (SH2ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH2ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 37)), - (SH2ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH2ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 25)), - (SH3ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH3ADD (XLenVT (SH1ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 41)), - (SH3ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH3ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 73)), - (SH3ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>; + (SH3ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 27)), - (SH1ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>; + (SH1ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), + (XLenVT (SH3ADD GPR:$r, GPR:$r)))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 45)), - (SH2ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>; + (SH2ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), + (XLenVT (SH3ADD GPR:$r, GPR:$r)))>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)), - (SH3ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>; + (SH3ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), + (XLenVT (SH3ADD GPR:$r, GPR:$r)))>; } // Predicates = [HasStdExtZba] let Predicates = [HasStdExtZba, IsRV64] in { @@ -751,7 +754,7 @@ def : Pat<(i64 (shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt)), // Match a shifted 0xffffffff mask. Use SRLI to clear the LSBs and SLLI_UW to // mask and shift. 
def : Pat<(i64 (and GPR:$rs1, Shifted32OnesMask:$mask)), - (SLLI_UW (SRLI GPR:$rs1, Shifted32OnesMask:$mask), + (SLLI_UW (XLenVT (SRLI GPR:$rs1, Shifted32OnesMask:$mask)), Shifted32OnesMask:$mask)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFF), GPR:$rs2)), (ADD_UW GPR:$rs1, GPR:$rs2)>; @@ -781,29 +784,29 @@ foreach i = {1,2,3} in { } def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFE), (XLenVT GPR:$rs2))), - (SH1ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; + (SH1ADD (XLenVT (SRLIW GPR:$rs1, 1)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFC), (XLenVT GPR:$rs2))), - (SH2ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>; + (SH2ADD (XLenVT (SRLIW GPR:$rs1, 2)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFF8), (XLenVT GPR:$rs2))), - (SH3ADD (SRLIW GPR:$rs1, 3), GPR:$rs2)>; + (SH3ADD (XLenVT (SRLIW GPR:$rs1, 3)), GPR:$rs2)>; // Use SRLI to clear the LSBs and SHXADD_UW to mask and shift. def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x1FFFFFFFE), (XLenVT GPR:$rs2))), - (SH1ADD_UW (SRLI GPR:$rs1, 1), GPR:$rs2)>; + (SH1ADD_UW (XLenVT (SRLI GPR:$rs1, 1)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x3FFFFFFFC), (XLenVT GPR:$rs2))), - (SH2ADD_UW (SRLI GPR:$rs1, 2), GPR:$rs2)>; + (SH2ADD_UW (XLenVT (SRLI GPR:$rs1, 2)), GPR:$rs2)>; def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8), (XLenVT GPR:$rs2))), - (SH3ADD_UW (SRLI GPR:$rs1, 3), GPR:$rs2)>; + (SH3ADD_UW (XLenVT (SRLI GPR:$rs1, 3)), GPR:$rs2)>; def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C3LeftShiftUW:$i)), - (SH1ADD (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i)), - (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i)))>; + (SH1ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))), + (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))))>; def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C5LeftShiftUW:$i)), - (SH2ADD (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i)), - (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i)))>; + (SH2ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))), + (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))))>; def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C9LeftShiftUW:$i)), - (SH3ADD (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i)), - (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i)))>; + (SH3ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))), + (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))))>; } // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbcOrZbkc] in { @@ -904,7 +907,7 @@ def : Pat<(i64 (and (anyext (i32 (shiftop GPR:$rs1, (i64 GPR:$rs2)))), 1)), def : Pat<(i32 (shiftop 1, (i64 GPR:$rs2))), (BSET (XLenVT X0), GPR:$rs2)>; def : Pat<(i32 (not (shiftop -1, (i64 GPR:$rs2)))), - (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>; + (ADDI (i32 (BSET (XLenVT X0), GPR:$rs2)), -1)>; def : Pat<(i32 (and (srl GPR:$rs1, uimm5:$shamt), (i32 1))), (BEXTI GPR:$rs1, uimm5:$shamt)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 2e0f754..e0f1c71 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -366,11 +366,11 @@ foreach Ext = ZfhExts in { let Predicates = [HasStdExtZfh] in { // Match signaling FEQ_H def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), FPR16:$rs2, SETEQ)), - (AND (FLE_H $rs1, $rs2), - (FLE_H $rs2, $rs1))>; + (AND (XLenVT (FLE_H $rs1, $rs2)), + (XLenVT (FLE_H $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), FPR16:$rs2, SETOEQ)), 
- (AND (FLE_H $rs1, $rs2), - (FLE_H $rs2, $rs1))>; + (AND (XLenVT (FLE_H $rs1, $rs2)), + (XLenVT (FLE_H $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), (f16 FPR16:$rs1), SETEQ)), (FLE_H $rs1, $rs1)>; @@ -381,11 +381,11 @@ def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), (f16 FPR16:$rs1), SETOEQ)), let Predicates = [HasStdExtZhinx] in { // Match signaling FEQ_H def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs2, SETEQ)), - (AND (FLE_H_INX $rs1, $rs2), - (FLE_H_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_H_INX $rs1, $rs2)), + (XLenVT (FLE_H_INX $rs2, $rs1)))>; def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs2, SETOEQ)), - (AND (FLE_H_INX $rs1, $rs2), - (FLE_H_INX $rs2, $rs1))>; + (AND (XLenVT (FLE_H_INX $rs1, $rs2)), + (XLenVT (FLE_H_INX $rs2, $rs1)))>; // If both operands are the same, use a single FLE. def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs1, SETEQ)), (FLE_H_INX $rs1, $rs1)>; -- cgit v1.1 From 0079136f7d2454ef2889061bb214741163ba232d Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 9 Feb 2024 07:48:43 +0000 Subject: [BasicAA] Fix Scale check in vscale aliasing. (#81174) This is a fix for #80818, as pointed out in #81144 it should be checking the abs of Scale. The added test changes from NoAlias to MayAlias. --- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 2 +- llvm/test/Analysis/BasicAA/vscale.ll | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index ae31814..682b0a2 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1187,7 +1187,7 @@ AliasResult BasicAAResult::aliasGEP( // so noalias still holds so long as the dependency distance is at least as // big as the typesize. if (VLeftSize.hasValue() && - Scale.uge(VLeftSize.getValue().getKnownMinValue())) + Scale.abs().uge(VLeftSize.getValue().getKnownMinValue())) return AliasResult::NoAlias; } diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index ce0c6f1..b2f5c66 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -458,6 +458,17 @@ define void @vscale_v1v2types(ptr %p) { ret void } +; CHECK-LABEL: vscale_negativescale +; CHECK-DAG: MayAlias: * %p, * %vm16 +define void @vscale_negativescale(ptr %p) vscale_range(1,16) { + %v = call i64 @llvm.vscale.i64() + %vm = mul nsw i64 %v, -15 + %vm16 = getelementptr i8, ptr %p, i64 %vm + load , ptr %vm16 + load , ptr %p + ret void +} + ; CHECK-LABEL: twovscales ; CHECK-DAG: MayAlias: * %vp161, * %vp162 ; CHECK-DAG: MayAlias: * %vp161, * %vp161b -- cgit v1.1 From 8316bf34ac21117f35bc8e6fafa2b3e7da75e1d5 Mon Sep 17 00:00:00 2001 From: DianQK Date: Fri, 9 Feb 2024 15:54:54 +0800 Subject: Revert "[RegisterCoalescer] Clear instructions not recorded in `ErasedInstrs` but erased (#79820)" This reverts commit 95b14da678f4670283240ef4cf60f3a39bed97b4. 
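For reference on the BasicAA fix earlier in this series: Scale is a signed APInt, so the old unsigned comparison Scale.uge(...) treated a negative scale such as the new test's -15 as a huge value and wrongly proved NoAlias. A minimal sketch of the arithmetic, assuming an LLVM tree to compile and link against; the 16-byte access size is an assumed example value, since only the -15 scale comes from the test:

// Not the BasicAA code itself; mirrors the fixed predicate with llvm::APInt.
#include "llvm/ADT/APInt.h"
#include <cstdint>
#include <cstdio>

int main() {
  llvm::APInt Scale(64, -15, /*isSigned=*/true); // %vm = mul nsw i64 %v, -15
  uint64_t KnownMinSize = 16; // assumed known-minimum access size in bytes

  // Old check: -15 viewed as unsigned is 0xFFFF...FFF1, so uge() holds and
  // NoAlias was concluded even though the accesses can overlap.
  bool OldNoAlias = Scale.uge(KnownMinSize);
  // Fixed check: |Scale| = 15 < 16, so NoAlias is no longer claimed and the
  // query falls through to MayAlias, as the updated test expects.
  bool NewNoAlias = Scale.abs().uge(KnownMinSize);

  std::printf("old=%d new=%d\n", OldNoAlias, NewNoAlias); // old=1 new=0
  return 0;
}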
--- llvm/lib/CodeGen/RegisterCoalescer.cpp | 27 +-- .../LoongArch/register-coalescer-crash-pr79718.mir | 213 --------------------- .../X86/PR71178-register-coalescer-crash.ll | 103 ---------- 3 files changed, 5 insertions(+), 338 deletions(-) delete mode 100644 llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir delete mode 100644 llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 7e9c992..cbb1a74 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -236,8 +236,7 @@ namespace { /// was successfully coalesced away. If it is not currently possible to /// coalesce this interval, but it may be possible if other things get /// coalesced, then it returns true by reference in 'Again'. - bool joinCopy(MachineInstr *CopyMI, bool &Again, - SmallPtrSetImpl &CurrentErasedInstrs); + bool joinCopy(MachineInstr *CopyMI, bool &Again); /// Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we @@ -1965,9 +1964,7 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } -bool RegisterCoalescer::joinCopy( - MachineInstr *CopyMI, bool &Again, - SmallPtrSetImpl &CurrentErasedInstrs) { +bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { Again = false; LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI); @@ -2159,9 +2156,7 @@ bool RegisterCoalescer::joinCopy( // CopyMI has been erased by joinIntervals at this point. Remove it from // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back // to the work list. This keeps ErasedInstrs from growing needlessly. - if (ErasedInstrs.erase(CopyMI)) - // But we may encounter the instruction again in this iteration. - CurrentErasedInstrs.insert(CopyMI); + ErasedInstrs.erase(CopyMI); // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. @@ -3987,33 +3982,21 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef CurrList) { bool Progress = false; - SmallPtrSet CurrentErasedInstrs; for (MachineInstr *&MI : CurrList) { if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) { + if (ErasedInstrs.count(MI)) { MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(MI, Again, CurrentErasedInstrs); + bool Success = joinCopy(MI, Again); Progress |= Success; if (Success || !Again) MI = nullptr; } - // Clear instructions not recorded in `ErasedInstrs` but erased. 
- if (!CurrentErasedInstrs.empty()) { - for (MachineInstr *&MI : CurrList) { - if (MI && CurrentErasedInstrs.count(MI)) - MI = nullptr; - } - for (MachineInstr *&MI : WorkList) { - if (MI && CurrentErasedInstrs.count(MI)) - MI = nullptr; - } - } return Progress; } diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir deleted file mode 100644 index 9bbb579..0000000 --- a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir +++ /dev/null @@ -1,213 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -o - %s -mtriple=loongarch64 \ -# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s - ---- -name: foo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: foo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 - ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 - ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 - ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 - ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 - ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.9(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: PseudoBR %bb.9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 - ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 - ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.6: - ; CHECK-NEXT: successors: %bb.11(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 - ; CHECK-NEXT: PseudoBR %bb.11 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 - ; CHECK-NEXT: PseudoBR %bb.8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.8: - ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 - ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 - ; CHECK-NEXT: PseudoBR %bb.9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.9: - ; CHECK-NEXT: successors: %bb.12(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 - ; CHECK-NEXT: PseudoBR %bb.12 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.10: - ; CHECK-NEXT: successors: %bb.11(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 - ; CHECK-NEXT: 
[[COPY6:%[0-9]+]]:gpr = COPY [[ORI]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.11: - ; CHECK-NEXT: successors: %bb.12(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.12: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 - ; CHECK-NEXT: PseudoBR %bb.1 - bb.0: - liveins: $r4, $r5, $r6, $r7, $r8 - - %0:gpr = COPY killed $r8 - %1:gpr = COPY killed $r7 - %2:gpr = COPY killed $r6 - %3:gpr = COPY killed $r5 - %4:gpr = COPY killed $r4 - %5:gpr = COPY $r0 - %6:gpr = COPY killed %5 - %7:gpr = ANDI killed %3, 1 - %8:gpr = ORI $r0, 1 - %9:gpr = ANDI killed %2, 1 - %10:gpr = ANDI killed %1, 1 - %11:gpr = ANDI killed %0, 1 - %12:gpr = COPY %6 - %13:gpr = COPY killed %6 - %14:gpr = IMPLICIT_DEF - - bb.1: - %15:gpr = COPY killed %14 - %16:gpr = COPY killed %13 - %17:gpr = COPY killed %12 - %18:gpr = COPY %17 - %19:gpr = COPY %16 - %20:gpr = COPY killed %16 - %21:gpr = COPY killed %15 - - bb.2: - successors: %bb.3, %bb.4 - - %22:gpr = COPY killed %21 - %23:gpr = COPY killed %20 - %24:gpr = COPY killed %19 - %25:gpr = COPY killed %18 - BEQZ %7, %bb.4 - - bb.3: - %26:gpr = COPY killed %24 - %27:gpr = COPY killed %23 - PseudoBR %bb.9 - - bb.4: - %28:gpr = COPY killed %23 - - bb.5: - successors: %bb.7(0x7c000000), %bb.6(0x04000000) - - %29:gpr = COPY killed %28 - dead %30:gpr = LD_D $r0, 8 - dead %31:gpr = LD_D $r0, 0 - BNEZ %9, %bb.7 - - bb.6: - %32:gpr = COPY $r0 - %33:gpr = COPY killed %32 - %34:gpr = COPY killed %33 - %35:gpr = COPY killed %22 - PseudoBR %bb.11 - - bb.7: - successors: %bb.8(0x7c000000), %bb.10(0x04000000) - - BEQZ %10, %bb.10 - PseudoBR %bb.8 - - bb.8: - successors: %bb.9(0x04000000), %bb.5(0x7c000000) - - %36:gpr = ADDI_D killed %29, 1 - %28:gpr = COPY %36 - %26:gpr = COPY %36 - %27:gpr = COPY killed %36 - BEQZ %11, %bb.5 - PseudoBR %bb.9 - - bb.9: - %37:gpr = COPY killed %27 - %38:gpr = COPY killed %26 - %39:gpr = COPY $r0 - ST_B killed %39, %4, 0 - %40:gpr = COPY killed %25 - %41:gpr = COPY killed %38 - %42:gpr = COPY killed %37 - %43:gpr = COPY killed %22 - PseudoBR %bb.12 - - bb.10: - %44:gpr = ADDI_D killed %29, 1 - %34:gpr = COPY %8 - %35:gpr = COPY killed %44 - - bb.11: - %45:gpr = COPY killed %35 - %46:gpr = COPY killed %34 - %47:gpr = COPY $r0 - ST_D killed %47, %4, 0 - %40:gpr = COPY %45 - %41:gpr = COPY %46 - %42:gpr = COPY killed %46 - %43:gpr = COPY killed %45 - - bb.12: - successors: %bb.2(0x7c000000), %bb.1(0x04000000) - - %48:gpr = COPY killed %43 - %49:gpr = COPY killed %42 - %50:gpr = COPY killed %41 - %51:gpr = COPY killed %40 - %12:gpr = COPY %51 - %13:gpr = COPY %50 - %14:gpr = COPY %48 - %18:gpr = COPY killed %51 - %19:gpr = COPY killed %50 - %20:gpr = COPY killed %49 - %21:gpr = COPY killed %48 - BEQ %17, %8, %bb.2 - PseudoBR %bb.1 - -... 
diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll deleted file mode 100644 index 0ce346f..0000000 --- a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll +++ /dev/null @@ -1,103 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s - -define i32 @h(i1 %arg, i32 %arg1) { -; CHECK-LABEL: h: -; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F -; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: jmp .LBB0_1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_9: # %bb18 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB0_10 -; CHECK-NEXT: .LBB0_1: # %bb4 -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: # %bb.7: # %bb16 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB0_9 -; CHECK-NEXT: # %bb.8: # %bb17 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: jmp .LBB0_9 -; CHECK-NEXT: .LBB0_2: # %bb9 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB0_4 -; CHECK-NEXT: # %bb.3: # %bb13 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: .LBB0_4: # %bb14 -; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: cmpl $1, %esi -; CHECK-NEXT: je .LBB0_1 -; CHECK-NEXT: # %bb.5: # %bb14 -; CHECK-NEXT: movl %eax, %r8d -; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: jne .LBB0_6 -; CHECK-NEXT: .LBB0_10: # %bb22 -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 -; CHECK-NEXT: movl %r8d, %eax -; CHECK-NEXT: retq -bb: - br label %bb2 - -bb2: ; preds = %bb14, %bb - %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] - %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] - br label %bb4 - -bb4: ; preds = %bb18, %bb2 - %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] - %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] - %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] - %i8 = icmp eq i64 %i6, 0 - br i1 %i8, label %bb16, label %bb9 - -bb9: ; preds = %bb4 - br i1 %arg, label %bb12, label %bb10 - -bb10: ; preds = %bb9 - %i11 = sdiv i64 0, 0 - br label %bb12 - -bb12: ; preds = %bb10, %bb9 - br i1 %arg, label %bb13, label %bb14 - -bb13: ; preds = %bb12 - br label %bb14 - -bb14: ; preds = %bb13, %bb12 - %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] - switch i32 %arg1, label %bb22 [ - i32 0, label %bb21 - i32 1, label %bb2 - ] - -bb16: ; preds = %bb4 - br i1 %arg, label %bb18, label %bb17 - -bb17: ; preds = %bb16 - br label %bb18 - -bb18: ; preds = %bb17, %bb16 - %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] - %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] - br i1 %arg, label %bb22, label %bb4 - -bb21: ; preds = %bb14 - br label %bb22 - -bb22: ; preds = %bb21, %bb18, %bb14 - %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] - ret i32 %i23 -} -- cgit v1.1 From ccb46e8365787c446236df20c068d101c637346a Mon Sep 17 00:00:00 2001 From: DianQK Date: Fri, 9 Feb 2024 15:58:48 +0800 Subject: Reapply "[RegisterCoalescer] Clear instructions not recorded in `ErasedInstrs` but erased (#79820)" This reverts commit 8316bf34ac21117f35bc8e6fafa2b3e7da75e1d5. 
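The reapplied patch records instructions erased while walking the current copy list in a local set, then nulls out any list entries that still point at them, so a freed MachineInstr is never visited twice. A standalone sketch of that pattern, using hypothetical stand-in types rather than the coalescer's own:

#include <cstdio>
#include <unordered_set>
#include <vector>

struct Instr { int Id; };

// Stand-in for joinCopy(): joining a copy may erase the instruction.
static bool joinCopy(Instr *MI, std::unordered_set<Instr *> &ErasedNow) {
  if (MI->Id == 1) { // pretend coalescing erased instruction 1
    ErasedNow.insert(MI);
    return true;
  }
  return false;
}

int main() {
  Instr Storage[] = {{0}, {1}, {2}};
  // The same instruction can be reached through more than one list entry.
  std::vector<Instr *> WorkList = {&Storage[0], &Storage[1], &Storage[2],
                                   &Storage[1]};
  std::unordered_set<Instr *> CurrentErased;

  for (Instr *&MI : WorkList) {
    // Skip entries erased earlier in this same pass.
    if (!MI || CurrentErased.count(MI)) {
      MI = nullptr;
      continue;
    }
    std::printf("visiting %d\n", MI->Id); // Id 1 is printed exactly once
    if (joinCopy(MI, CurrentErased))
      MI = nullptr;
  }
  // As in the patch, clear every list that may still hold pointers recorded
  // in CurrentErased before the next pass runs.
  for (Instr *&MI : WorkList)
    if (MI && CurrentErased.count(MI))
      MI = nullptr;
  return 0;
}

Keeping the set local to one pass preserves the earlier cleanup's goal of not letting ErasedInstrs grow needlessly, while still protecting later iterations from stale pointers.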
--- llvm/lib/CodeGen/RegisterCoalescer.cpp | 27 ++- .../LoongArch/register-coalescer-crash-pr79718.mir | 212 +++++++++++++++++++++ .../X86/PR71178-register-coalescer-crash.ll | 103 ++++++++++ 3 files changed, 337 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir create mode 100644 llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index cbb1a74..7e9c992 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -236,7 +236,8 @@ namespace { /// was successfully coalesced away. If it is not currently possible to /// coalesce this interval, but it may be possible if other things get /// coalesced, then it returns true by reference in 'Again'. - bool joinCopy(MachineInstr *CopyMI, bool &Again); + bool joinCopy(MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs); /// Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we @@ -1964,7 +1965,9 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } -bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { +bool RegisterCoalescer::joinCopy( + MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs) { Again = false; LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI); @@ -2156,7 +2159,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { // CopyMI has been erased by joinIntervals at this point. Remove it from // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back // to the work list. This keeps ErasedInstrs from growing needlessly. - ErasedInstrs.erase(CopyMI); + if (ErasedInstrs.erase(CopyMI)) + // But we may encounter the instruction again in this iteration. + CurrentErasedInstrs.insert(CopyMI); // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. @@ -3982,21 +3987,33 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef CurrList) { bool Progress = false; + SmallPtrSet CurrentErasedInstrs; for (MachineInstr *&MI : CurrList) { if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.count(MI)) { + if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) { MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(MI, Again); + bool Success = joinCopy(MI, Again, CurrentErasedInstrs); Progress |= Success; if (Success || !Again) MI = nullptr; } + // Clear instructions not recorded in `ErasedInstrs` but erased. 
+ if (!CurrentErasedInstrs.empty()) { + for (MachineInstr *&MI : CurrList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + for (MachineInstr *&MI : WorkList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + } return Progress; } diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir new file mode 100644 index 0000000..b3c44af --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir @@ -0,0 +1,212 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -o - %s -mtriple=loongarch64 \ +# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s + +--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 + ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 + ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 + ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 + ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 + ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 + ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 + ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: PseudoBR %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 + ; CHECK-NEXT: PseudoBR %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 + ; CHECK-NEXT: PseudoBR %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = 
COPY [[ORI]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + bb.0: + liveins: $r4, $r5, $r6, $r7, $r8 + + %0:gpr = COPY killed $r8 + %1:gpr = COPY killed $r7 + %2:gpr = COPY killed $r6 + %3:gpr = COPY killed $r5 + %4:gpr = COPY killed $r4 + %5:gpr = COPY $r0 + %6:gpr = COPY killed %5 + %7:gpr = ANDI killed %3, 1 + %8:gpr = ORI $r0, 1 + %9:gpr = ANDI killed %2, 1 + %10:gpr = ANDI killed %1, 1 + %11:gpr = ANDI killed %0, 1 + %12:gpr = COPY %6 + %13:gpr = COPY killed %6 + %14:gpr = IMPLICIT_DEF + + bb.1: + %15:gpr = COPY killed %14 + %16:gpr = COPY killed %13 + %17:gpr = COPY killed %12 + %18:gpr = COPY %17 + %19:gpr = COPY %16 + %20:gpr = COPY killed %16 + %21:gpr = COPY killed %15 + + bb.2: + successors: %bb.3, %bb.4 + + %22:gpr = COPY killed %21 + %23:gpr = COPY killed %20 + %24:gpr = COPY killed %19 + %25:gpr = COPY killed %18 + BEQZ %7, %bb.4 + + bb.3: + %26:gpr = COPY killed %24 + %27:gpr = COPY killed %23 + PseudoBR %bb.9 + + bb.4: + %28:gpr = COPY killed %23 + + bb.5: + successors: %bb.7(0x7c000000), %bb.6(0x04000000) + + %29:gpr = COPY killed %28 + dead %30:gpr = LD_D $r0, 8 + dead %31:gpr = LD_D $r0, 0 + BNEZ %9, %bb.7 + + bb.6: + %32:gpr = COPY $r0 + %33:gpr = COPY killed %32 + %34:gpr = COPY killed %33 + %35:gpr = COPY killed %22 + PseudoBR %bb.11 + + bb.7: + successors: %bb.8(0x7c000000), %bb.10(0x04000000) + + BEQZ %10, %bb.10 + PseudoBR %bb.8 + + bb.8: + successors: %bb.9(0x04000000), %bb.5(0x7c000000) + + %36:gpr = ADDI_D killed %29, 1 + %28:gpr = COPY %36 + %26:gpr = COPY %36 + %27:gpr = COPY killed %36 + BEQZ %11, %bb.5 + PseudoBR %bb.9 + + bb.9: + %37:gpr = COPY killed %27 + %38:gpr = COPY killed %26 + %39:gpr = COPY $r0 + ST_B killed %39, %4, 0 + %40:gpr = COPY killed %25 + %41:gpr = COPY killed %38 + %42:gpr = COPY killed %37 + %43:gpr = COPY killed %22 + PseudoBR %bb.12 + + bb.10: + %44:gpr = ADDI_D killed %29, 1 + %34:gpr = COPY %8 + %35:gpr = COPY killed %44 + + bb.11: + %45:gpr = COPY killed %35 + %46:gpr = COPY killed %34 + %47:gpr = COPY $r0 + ST_D killed %47, %4, 0 + %40:gpr = COPY %45 + %41:gpr = COPY %46 + %42:gpr = COPY killed %46 + %43:gpr = COPY killed %45 + + bb.12: + successors: %bb.2(0x7c000000), %bb.1(0x04000000) + + %48:gpr = COPY killed %43 + %49:gpr = COPY killed %42 + %50:gpr = COPY killed %41 + %51:gpr = COPY killed %40 + %12:gpr = COPY %51 + %13:gpr = COPY %50 + %14:gpr = COPY %48 + %18:gpr = COPY killed %51 + %19:gpr = COPY killed %50 + %20:gpr = COPY killed %49 + %21:gpr = COPY killed %48 + BEQ %17, %8, %bb.2 + PseudoBR %bb.1 + +... 
diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll new file mode 100644 index 0000000..0ce346f --- /dev/null +++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s + +define i32 @h(i1 %arg, i32 %arg1) { +; CHECK-LABEL: h: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_9: # %bb18 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: .LBB0_1: # %bb4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.7: # %bb16 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_9 +; CHECK-NEXT: # %bb.8: # %bb17 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: jmp .LBB0_9 +; CHECK-NEXT: .LBB0_2: # %bb9 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: # %bb13 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .LBB0_4: # %bb14 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmpl $1, %esi +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.5: # %bb14 +; CHECK-NEXT: movl %eax, %r8d +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: jne .LBB0_6 +; CHECK-NEXT: .LBB0_10: # %bb22 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: retq +bb: + br label %bb2 + +bb2: ; preds = %bb14, %bb + %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] + %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] + br label %bb4 + +bb4: ; preds = %bb18, %bb2 + %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] + %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] + %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] + %i8 = icmp eq i64 %i6, 0 + br i1 %i8, label %bb16, label %bb9 + +bb9: ; preds = %bb4 + br i1 %arg, label %bb12, label %bb10 + +bb10: ; preds = %bb9 + %i11 = sdiv i64 0, 0 + br label %bb12 + +bb12: ; preds = %bb10, %bb9 + br i1 %arg, label %bb13, label %bb14 + +bb13: ; preds = %bb12 + br label %bb14 + +bb14: ; preds = %bb13, %bb12 + %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] + switch i32 %arg1, label %bb22 [ + i32 0, label %bb21 + i32 1, label %bb2 + ] + +bb16: ; preds = %bb4 + br i1 %arg, label %bb18, label %bb17 + +bb17: ; preds = %bb16 + br label %bb18 + +bb18: ; preds = %bb17, %bb16 + %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] + %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] + br i1 %arg, label %bb22, label %bb4 + +bb21: ; preds = %bb14 + br label %bb22 + +bb22: ; preds = %bb21, %bb18, %bb14 + %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] + ret i32 %i23 +} -- cgit v1.1 From b477d39bf6811ac12a1e7e98f308cf4c9a8de26f Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Fri, 9 Feb 2024 09:10:49 +0100 Subject: [flang] Align runtime info and lowering regarding passing ABIs (#81166) Runtime derived type info contains information to tell the runtime if some argument in a user defined assignment must be passed with a 
descriptor or not. This information was not properly built: it would tell the
runtime that a TARGET argument must be passed via descriptor, which is
incorrect. Share the logic between lowering and runtime info generation to
determine if an argument must be passed by descriptor or not. ---
 flang/include/flang/Evaluate/characteristics.h | 1 +
 flang/lib/Evaluate/characteristics.cpp | 24 +++++++++++++++++++++++
 flang/lib/Lower/CallInterface.cpp | 27 +-------------------------
 flang/lib/Semantics/runtime-type-info.cpp | 14 ++++++++-----
 flang/test/Semantics/typeinfo09.f90 | 20 +++++++++++++++++++
 5 files changed, 55 insertions(+), 31 deletions(-)
 create mode 100644 flang/test/Semantics/typeinfo09.f90

diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index fd4af15..04a0d71 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -229,6 +229,7 @@ struct DummyDataObject {
 static std::optional<DummyDataObject> Characterize(
 const semantics::Symbol &, FoldingContext &);
 bool CanBePassedViaImplicitInterface(std::string *whyNot = nullptr) const;
+ bool IsPassedByDescriptor(bool isBindC) const;
 llvm::raw_ostream &Dump(llvm::raw_ostream &) const;
 TypeAndShape type;
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index d480050..c14a422 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -461,6 +461,30 @@ bool DummyDataObject::CanBePassedViaImplicitInterface(
 }
 }
+bool DummyDataObject::IsPassedByDescriptor(bool isBindC) const {
+ constexpr TypeAndShape::Attrs shapeRequiringBox = {
+ TypeAndShape::Attr::AssumedShape, TypeAndShape::Attr::DeferredShape,
+ TypeAndShape::Attr::AssumedRank, TypeAndShape::Attr::Coarray};
+ if ((attrs & Attrs{Attr::Allocatable, Attr::Pointer}).any()) {
+ return true;
+ } else if ((type.attrs() & shapeRequiringBox).any()) {
+ // Need to pass shape/coshape info in a descriptor.
+ return true;
+ } else if (type.type().IsPolymorphic() && !type.type().IsAssumedType()) {
+ // Need to pass dynamic type info in a descriptor.
+ return true;
+ } else if (const auto *derived{GetDerivedTypeSpec(type.type())}) {
+ if (const semantics::Scope *scope = derived->scope()) {
+ // Need to pass length type parameters in a descriptor if any.
+ return scope->IsDerivedTypeWithLengthParameter();
+ }
+ } else if (isBindC && type.type().IsAssumedLengthCharacter()) {
+ // Fortran 2018 18.3.6 point 2 (5)
+ return true;
+ }
+ return false;
+}
+
 llvm::raw_ostream &DummyDataObject::Dump(llvm::raw_ostream &o) const {
 attrs.Dump(o, EnumToString);
 if (intent != common::Intent::Default) {
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 4c297ce..f67ee88 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -916,31 +916,6 @@ private:
 }
 }
- // Define when an explicit argument must be passed in a fir.box.
- bool dummyRequiresBox(
- const Fortran::evaluate::characteristics::DummyDataObject &obj,
- bool isBindC) {
- using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
- using ShapeAttrs = Fortran::evaluate::characteristics::TypeAndShape::Attrs;
- constexpr ShapeAttrs shapeRequiringBox = {
- ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
- ShapeAttr::AssumedRank, ShapeAttr::Coarray};
- if ((obj.type.attrs() & shapeRequiringBox).any())
- // Need to pass shape/coshape info in fir.box.
- return true; - if (obj.type.type().IsPolymorphic() && !obj.type.type().IsAssumedType()) - // Need to pass dynamic type info in fir.box. - return true; - if (const Fortran::semantics::DerivedTypeSpec *derived = - Fortran::evaluate::GetDerivedTypeSpec(obj.type.type())) - if (const Fortran::semantics::Scope *scope = derived->scope()) - // Need to pass length type parameters in fir.box if any. - return scope->IsDerivedTypeWithLengthParameter(); - if (isBindC && obj.type.type().IsAssumedLengthCharacter()) - return true; // Fortran 2018 18.3.6 point 2 (5) - return false; - } - mlir::Type translateDynamicType(const Fortran::evaluate::DynamicType &dynamicType) { Fortran::common::TypeCategory cat = dynamicType.category(); @@ -1027,7 +1002,7 @@ private: addFirOperand(boxRefType, nextPassedArgPosition(), Property::MutableBox, attrs); addPassedArg(PassEntityBy::MutableBox, entity, characteristics); - } else if (dummyRequiresBox(obj, isBindC)) { + } else if (obj.IsPassedByDescriptor(isBindC)) { // Pass as fir.box or fir.class if (isValueAttr && !getConverter().getLoweringOptions().getLowerToHighLevelFIR()) diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index de71083..66c4216 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -1144,7 +1144,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( which = scalarFinalEnum_; if (int rank{evaluate::GetRank(typeAndShape.shape())}; rank > 0) { which = IntExpr<1>(ToInt64(which).value() + rank); - if (!proc->dummyArguments[0].CanBePassedViaImplicitInterface()) { + if (dummyData.IsPassedByDescriptor(proc->IsBindC())) { argThatMightBeDescriptor = 1; } if (!typeAndShape.attrs().test(evaluate::characteristics:: @@ -1187,10 +1187,14 @@ void RuntimeTableBuilder::DescribeSpecialProc( break; } } - if (argThatMightBeDescriptor != 0 && - !proc->dummyArguments.at(argThatMightBeDescriptor - 1) - .CanBePassedViaImplicitInterface()) { - isArgDescriptorSet |= 1 << (argThatMightBeDescriptor - 1); + if (argThatMightBeDescriptor != 0) { + if (const auto *dummyData{ + std::get_if( + &proc->dummyArguments.at(argThatMightBeDescriptor - 1).u)}) { + if (dummyData->IsPassedByDescriptor(proc->IsBindC())) { + isArgDescriptorSet |= 1 << (argThatMightBeDescriptor - 1); + } + } } evaluate::StructureConstructorValues values; auto index{evaluate::ToInt64(which)}; diff --git a/flang/test/Semantics/typeinfo09.f90 b/flang/test/Semantics/typeinfo09.f90 new file mode 100644 index 0000000..3527ee6 --- /dev/null +++ b/flang/test/Semantics/typeinfo09.f90 @@ -0,0 +1,20 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s +! test setting of isargdescriptorset in the runtime type info. 
+ +module m + type :: sometype + contains + procedure :: copy => copy_impl + generic :: assignment(=) => copy + end type +interface + subroutine copy_impl(this, x) + import + class(sometype), intent(out) :: this + type(sometype), target, intent(in) :: x + end subroutine +end interface +end module + +!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=copy_impl)] -- cgit v1.1 From bc6955f18ced3ca89d49bc28eeb58cd6d367e136 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 9 Feb 2024 09:20:25 +0100 Subject: [AMDGPU] Don't fix the scavenge slot at offset 0 (#79136) At the moment, the emergency spill slot is a fixed object for entry functions and chain functions, and a regular stack object otherwise. This patch adopts the latter behaviour for entry/chain functions too. It seems this was always the intention [1] and it will also save us a bit of stack space in cases where the first stack object has a large alignment. [1] https://github.com/llvm/llvm-project/commit/34c8b835b16fb3879f1b9770e91df21883356bb6 --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 12 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 - .../AMDGPU/GlobalISel/call-outgoing-stack-args.ll | 54 +- .../AMDGPU/GlobalISel/crash-stack-address-O0.ll | 4 +- .../AMDGPU/GlobalISel/flat-scratch-init.gfx.ll | 4 +- .../test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 130 +-- .../AMDGPU/GlobalISel/insertelement-stack-lower.ll | 258 ++--- llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 24 +- .../CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 104 +- llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 10 +- llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 2 +- llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 106 +- .../AMDGPU/callee-special-input-vgprs-packed.ll | 4 +- .../CodeGen/AMDGPU/callee-special-input-vgprs.ll | 6 +- llvm/test/CodeGen/AMDGPU/captured-frame-index.ll | 50 +- llvm/test/CodeGen/AMDGPU/cc-update.ll | 32 +- llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll | 42 +- llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 13 +- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 54 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 132 +-- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 2 +- .../CodeGen/AMDGPU/control-flow-fastregalloc.ll | 16 +- llvm/test/CodeGen/AMDGPU/extload-private.ll | 8 +- llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll | 2 + llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll | 16 +- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 108 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 570 +++++----- .../frame-index-elimination-tied-operand.mir | 2 +- .../test/CodeGen/AMDGPU/frame-index-elimination.ll | 2 +- .../CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 72 +- .../CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 72 +- llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll | 20 +- llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | 18 +- .../test/CodeGen/AMDGPU/kernarg-stack-alignment.ll | 10 +- .../test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 4 +- .../AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll | 2 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 144 +-- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 32 +- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 24 +- llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll | 4 +- ...partial-regcopy-and-spill-missed-at-regalloc.ll | 8 +- 
.../CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll | 62 +- .../AMDGPU/pei-amdgpu-cs-chain-preserve.mir | 20 +- llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir | 16 +- .../regalloc-introduces-copy-sgpr-to-agpr.mir | 136 +-- llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 8 +- llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 16 +- llvm/test/CodeGen/AMDGPU/sgpr-spill.mir | 240 ++-- llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll | 2 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 32 +- llvm/test/CodeGen/AMDGPU/spill-m0.ll | 4 +- .../CodeGen/AMDGPU/spill-offset-calculation.ll | 36 +- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 16 +- llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir | 54 +- llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll | 28 +- llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll | 8 +- llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll | 18 +- .../CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll | 68 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 2 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 14 +- llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll | 1164 ++++++++++---------- llvm/test/CodeGen/AMDGPU/wqm.ll | 8 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 88 +- .../MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll | 4 +- .../MIR/AMDGPU/machine-function-info-after-pei.ll | 2 +- llvm/test/DebugInfo/AMDGPU/variable-locations.ll | 6 +- 67 files changed, 2127 insertions(+), 2109 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index b94d143..52d6fe6 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -552,14 +552,10 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI) { if (ScavengeFI) return *ScavengeFI; - if (isBottomOfStack()) { - ScavengeFI = MFI.CreateFixedObject( - TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); - } else { - ScavengeFI = MFI.CreateStackObject( - TRI.getSpillSize(AMDGPU::SGPR_32RegClass), - TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false); - } + + ScavengeFI = + MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass), + TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false); return *ScavengeFI; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 1a22b77..3664535 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2287,9 +2287,6 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (FrameReg) FIOp.ChangeToRegister(FrameReg, false); - if (!Offset) - return false; - MachineOperand *OffsetOp = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); int64_t NewOffset = Offset + OffsetOp->getImm(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 6e49a5a..61bc28b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -67,6 +67,8 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; MUBUF-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 offset:16 @@ -97,25 +99,23 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; MUBUF-NEXT: s_nop 0 -; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 -; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16 -; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20 -; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24 -; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28 -; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32 -; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36 -; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40 -; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44 -; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48 -; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52 -; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56 -; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60 -; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64 -; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8 +; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 +; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16 +; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20 +; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24 +; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28 +; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32 +; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36 +; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40 +; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44 +; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48 +; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52 +; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56 +; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60 ; MUBUF-NEXT: s_movk_i32 s32, 0x1400 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 @@ -162,6 +162,7 @@ define amdgpu_kernel void @kernel_caller_byval() { ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24 @@ -177,16 +178,15 @@ define amdgpu_kernel void @kernel_caller_byval() { ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; FLATSCR-NEXT: s_nop 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16 
-; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24 -; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32 -; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40 -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48 -; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56 -; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:8 +; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:16 +; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:24 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:32 +; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:40 +; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:48 +; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:56 ; FLATSCR-NEXT: s_movk_i32 s32, 0x50 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll index 9580326..0d79365 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll @@ -12,10 +12,10 @@ define amdgpu_kernel void @stack_write_fi() { ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll index dcad707..b4b95fd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll @@ -12,7 +12,7 @@ define amdgpu_ps void @amdgpu_ps() { ; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4 ; MESA-NEXT: s_mov_b64 s[0:1], src_private_base ; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; MESA-NEXT: v_mov_b32_e32 v0, 4 +; MESA-NEXT: v_mov_b32_e32 v0, 0 ; MESA-NEXT: v_mov_b32_e32 v1, s1 ; MESA-NEXT: v_mov_b32_e32 v2, 0 ; MESA-NEXT: flat_store_dword v[0:1], v2 @@ -24,7 +24,7 @@ define amdgpu_ps void @amdgpu_ps() { ; PAL-NEXT: s_getpc_b64 s[2:3] ; PAL-NEXT: s_mov_b32 s2, s0 ; PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; PAL-NEXT: v_mov_b32_e32 v0, 4 +; PAL-NEXT: v_mov_b32_e32 v0, 0 ; PAL-NEXT: v_mov_b32_e32 v2, 0 ; PAL-NEXT: s_waitcnt lgkmcnt(0) ; PAL-NEXT: s_and_b32 s3, s3, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 75065f6..921bdb5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -15,11 +15,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_add_i32 s1, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword 
v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -36,8 +36,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_add_i32 s0, s0, 4 -; GFX10-NEXT: s_add_i32 s1, s1, 4 +; GFX10-NEXT: s_add_i32 s0, s0, 0 +; GFX10-NEXT: s_add_i32 s1, s1, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -51,12 +51,12 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 4 +; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; @@ -69,10 +69,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_add_i32 s0, s0, 4 +; GFX11-NEXT: s_add_i32 s0, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; @@ -87,9 +87,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -109,12 +109,12 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -129,8 +129,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v1 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc @@ -143,9 +143,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX940-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -156,9 +156,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm @@ -169,9 +169,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:128 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -324,16 +324,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_addk_i32 s1, 0x104 +; GFX9-NEXT: s_addk_i32 s1, 0x100 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -345,15 +345,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_addk_i32 s0, 0x104 -; GFX10-NEXT: s_addk_i32 s1, 0x104 +; GFX10-NEXT: s_addk_i32 s0, 0x100 +; GFX10-NEXT: s_addk_i32 s1, 0x100 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -363,42 +363,42 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; 
GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x104 +; GFX940-NEXT: s_addk_i32 s1, 0x100 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:260 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, v0, off offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_addk_i32 s0, 0x104 +; GFX11-NEXT: s_addk_i32 s0, 0x100 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:260 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -408,9 +408,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -432,16 +432,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0x104, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -455,11 +455,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: scratch_load_dword v3, 
off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc @@ -468,15 +468,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX940-LABEL: store_load_vindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off offset:260 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0x104, v0 +; GFX940-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -485,12 +485,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm @@ -500,12 +500,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 15 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:384 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -708,7 +708,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -718,9 +718,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -812,12 +812,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 15 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16512 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -1003,11 +1003,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX12-LABEL: store_load_large_imm_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX940-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX940-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc @@ -1173,9 +1173,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index faab70c..a1c99f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10 ; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0x100 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40 @@ -35,189 +35,189 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 ; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:260 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:264 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:268 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:272 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:276 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:280 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:288 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:292 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:296 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:300 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:304 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:308 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:312 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:316 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:12 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:20 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:28 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:36 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:40 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:44 +; GCN-NEXT: 
buffer_store_dword v12, off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:52 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60 ; GCN-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:320 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 ; GCN-NEXT: v_mov_b32_e32 v0, s53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:324 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68 ; GCN-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:328 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72 ; GCN-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:332 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76 ; GCN-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:336 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80 ; GCN-NEXT: v_mov_b32_e32 v0, s57 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:340 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84 ; GCN-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:344 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88 ; GCN-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:348 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92 ; GCN-NEXT: v_mov_b32_e32 v0, s60 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:352 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96 ; GCN-NEXT: v_mov_b32_e32 v0, s61 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:356 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100 ; GCN-NEXT: v_mov_b32_e32 v0, s62 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:360 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104 ; GCN-NEXT: v_mov_b32_e32 v0, s63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:364 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108 ; GCN-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:368 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112 ; GCN-NEXT: v_mov_b32_e32 v0, s65 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:372 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116 ; GCN-NEXT: v_mov_b32_e32 v0, s66 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:376 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120 ; GCN-NEXT: v_mov_b32_e32 v0, s67 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:380 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:384 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:388 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:392 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:136 ; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:396 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:140 ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:400 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:144 ; GCN-NEXT: 
v_mov_b32_e32 v0, s9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:404 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:148 ; GCN-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:408 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:152 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:412 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:156 ; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:416 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:160 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:420 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:164 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:424 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:168 ; GCN-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:428 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:172 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:432 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:176 ; GCN-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:436 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:180 ; GCN-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:440 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:184 ; GCN-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:444 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:188 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:448 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:192 ; GCN-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:452 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:196 ; GCN-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:456 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:200 ; GCN-NEXT: v_mov_b32_e32 v0, s39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:460 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:204 ; GCN-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:464 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:208 ; GCN-NEXT: v_mov_b32_e32 v0, s41 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:468 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:212 ; GCN-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:472 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:216 ; GCN-NEXT: v_mov_b32_e32 v0, s43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:476 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:220 ; GCN-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:480 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:224 ; GCN-NEXT: v_mov_b32_e32 v0, s45 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:484 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:228 ; GCN-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:488 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:232 ; GCN-NEXT: v_mov_b32_e32 v0, s47 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 offset:492 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:236 ; GCN-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:496 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240 ; GCN-NEXT: v_mov_b32_e32 v0, s49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:500 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244 ; GCN-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NEXT: s_and_b32 s4, s25, 63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:504 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248 ; GCN-NEXT: v_mov_b32_e32 v0, s51 ; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:508 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252 ; GCN-NEXT: v_add_u32_e32 v0, s4, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s24 ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:300 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:304 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:308 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:312 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:316 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:320 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], 0 offset:324 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:328 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:332 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:336 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:340 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:344 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:348 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:352 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:356 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:360 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:364 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:368 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:372 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:376 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:380 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:384 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:388 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:392 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:396 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:400 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:404 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:408 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:412 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:416 -; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], 0 offset:420 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:424 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:428 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:432 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:436 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:440 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:444 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:448 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:452 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:456 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:460 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:464 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:468 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:472 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:476 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:480 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:484 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:488 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 0 offset:492 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], 0 offset:496 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:64 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], 0 offset:68 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:72 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:76 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:80 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:84 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:88 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:92 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:96 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:100 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:104 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:108 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:112 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:116 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:120 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:124 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:128 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:132 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:136 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:140 +; 
GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:144 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:148 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:152 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:156 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:160 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 0 offset:164 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:168 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:172 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:176 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:180 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:184 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:188 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:192 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:196 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:200 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:204 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:208 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:212 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:216 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:220 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:224 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:228 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:232 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 0 offset:236 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], 0 offset:240 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:244 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:248 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:252 ; GCN-NEXT: s_waitcnt vmcnt(60) ; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21] ; GCN-NEXT: s_waitcnt vmcnt(57) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index 3e572f9..c92b78c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -365,8 +365,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; GISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi ; GISEL-GFX11-NEXT: s_mov_b32 s32, 16 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL-GFX11-NEXT: s_endpgm ; @@ -378,8 +378,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] -; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_movk_i32 s32, 0x200 ; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-GFX10-NEXT: s_endpgm @@ -391,8 +391,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; DAGISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 16 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; DAGISEL-GFX11-NEXT: 
s_swappc_b64 s[30:31], s[0:1] ; DAGISEL-GFX11-NEXT: s_endpgm ; @@ -404,8 +404,8 @@ define amdgpu_cs_chain void @alloca_and_call() { ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] -; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 +; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; DAGISEL-GFX10-NEXT: s_movk_i32 s32, 0x200 ; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; DAGISEL-GFX10-NEXT: s_endpgm @@ -867,7 +867,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 ; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v0 +; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 0, v0 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc @@ -882,7 +882,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4 -; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0 +; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen ; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4 @@ -898,7 +898,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 32 +; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 0 ; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc ; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; DAGISEL-GFX11-NEXT: s_endpgm @@ -907,7 +907,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4 -; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 32 +; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index 4190d07..8d9ed9b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -181,13 +181,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND -; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte 
Folded Reload +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 @@ -198,13 +198,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 @@ -215,13 +215,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND -; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo @@ -232,13 +232,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo @@ -254,13 +254,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; GISEL-GFX11-LABEL: chain_preserve_to_chain: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; 
GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND -; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 @@ -271,13 +271,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; GISEL-GFX10-LABEL: chain_preserve_to_chain: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 @@ -288,13 +288,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND -; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo @@ -305,13 +305,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo @@ -327,7 +327,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: 
s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 ; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo @@ -336,7 +336,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND -; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi @@ -349,7 +349,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo @@ -358,7 +358,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi @@ -370,7 +370,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 ; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo @@ -379,7 +379,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND -; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo @@ -392,7 +392,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 ; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo @@ -401,7 +401,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload 
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
@@ -422,8 +422,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX11: ; %bb.0:
 ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:8
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:4
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT: ;;#ASMSTART
@@ -433,8 +433,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11
 ; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4
-; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:8
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off
+; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:4
 ; GISEL-GFX11-NEXT: s_mov_b32 s0, s3
 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
@@ -442,8 +442,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; GISEL-GFX10: ; %bb.0:
 ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8
 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT: ;;#ASMSTART
@@ -453,8 +453,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11
 ; GISEL-GFX10-NEXT: s_clause 0x1
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4
-; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:8
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0
+; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:4
 ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -463,8 +463,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX11: ; %bb.0:
 ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT: s_clause 0x1
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:8
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:4
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART
@@ -474,8 +474,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX11-NEXT: s_clause 0x1
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:8
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:4
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
@@ -483,8 +483,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX10: ; %bb.0:
 ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART
@@ -494,8 +494,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX10-NEXT: s_clause 0x1
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4
-; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:8
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0
+; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:4
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -508,13 +508,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; GISEL-GFX11: ; %bb.0:
 ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
 ; GISEL-GFX11-NEXT: s_mov_b32 s2, s0
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
 ; GISEL-GFX11-NEXT: ;;#ASMSTART
 ; GISEL-GFX11-NEXT: s_nop
 ; GISEL-GFX11-NEXT: ;;#ASMEND
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
 ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
 ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
@@ -525,13 +525,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; GISEL-GFX10: ; %bb.0:
 ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT: s_mov_b32 s2, s0
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; GISEL-GFX10-NEXT: ;;#ASMSTART
 ; GISEL-GFX10-NEXT: s_nop
 ; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
 ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
 ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
@@ -542,13 +542,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX11: ; %bb.0:
 ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0
 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT: s_nop
 ; DAGISEL-GFX11-NEXT: ;;#ASMEND
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
@@ -559,13 +559,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX10: ; %bb.0:
 ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0
 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT: s_nop
 ; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
@@ -592,7 +592,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
 ; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v0
+; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 0, v0
 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
@@ -607,7 +607,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4
-; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0
+; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
 ; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen
 ; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4
@@ -623,7 +623,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 32
+; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 0
 ; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
 ; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; DAGISEL-GFX11-NEXT: s_endpgm
@@ -632,7 +632,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ; DAGISEL-GFX10: ; %bb.0:
 ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
-; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 32
+; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 0
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index ff2f2c6..93c18de 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -209,8 +209,8 @@ for.end:
 ; R600-VECT: MOVA_INT
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:6 ; encoding: [0x06,0x00,0x68,0xe0
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:2 ; encoding: [0x02,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x68,0xe0
 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort.
 ; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0
@@ -238,8 +238,8 @@ entry:
 ; SI-PROMOTE-VECT-DAG: s_lshl_b32
 ; SI-PROMOTE-VECT-DAG: v_lshrrev
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x60,0xe0
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:5 ; encoding: [0x05,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:1 ; encoding: [0x01,0x00,0x60,0xe0
 define amdgpu_kernel void @char_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
 %0 = alloca [2 x i8], addrspace(5)
@@ -258,7 +258,7 @@ entry:
 ; FUNC-LABEL: {{^}}no_overlap:
 ;
 ; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ;
+; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
 define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
 %0 = alloca [3 x i8], align 1, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index 954994a..d33196b 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -12,7 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-LABEL: {{^}}test_private_array_ptr_calc:
-; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}}
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
 ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
 ; SI-ALLOCA: s_barrier
 ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 381fb98..f72d22b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4655,11 +4655,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; VI-NEXT: s_add_u32 s36, s36, s1
 ; VI-NEXT: s_addc_u32 s37, s37, 0
 ; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; VI-NEXT: v_mov_b32_e32 v0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT: s_movk_i32 s32, 0x400
 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4682,11 +4682,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; CI-NEXT: s_add_u32 s36, s36, s1
 ; CI-NEXT: s_addc_u32 s37, s37, 0
 ; CI-NEXT: v_mov_b32_e32 v0, 3
-; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; CI-NEXT: v_mov_b32_e32 v0, 8
-; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT: s_movk_i32 s32, 0x400
 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4709,12 +4709,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX9-NEXT: s_add_u32 s36, s36, s1
 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
 ; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT: s_movk_i32 s32, 0x400
 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4736,9 +4736,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4
 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
 ; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8
-; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12
-; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX11-NEXT: scratch_store_b8 off, v0, off
+; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4
+; GFX11-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -4753,11 +4753,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-NEXT: s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT: v_mov_b32_e32 v0, 3
-; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0
 ; HSA-NEXT: v_mov_b32_e32 v0, 8
-; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
 ; HSA-NEXT: s_movk_i32 s32, 0x400
 ; HSA-NEXT: s_getpc_b64 s[4:5]
 ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
@@ -4787,11 +4787,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; VI-NEXT: s_add_u32 s36, s36, s3
 ; VI-NEXT: s_addc_u32 s37, s37, 0
 ; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; VI-NEXT: v_mov_b32_e32 v0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0
 ; VI-NEXT: s_movk_i32 s32, 0x800
 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4802,10 +4802,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
 ; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: buffer_store_dword v1, off, s[36:39], s32
-; VI-NEXT: v_mov_b32_e32 v0, 16
+; VI-NEXT: v_mov_b32_e32 v0, 8
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20
+; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
 ; VI-NEXT: s_mov_b32 s3, 0xf000
 ; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: s_waitcnt vmcnt(1)
@@ -4824,11 +4824,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; CI-NEXT: s_add_u32 s36, s36, s3
 ; CI-NEXT: s_addc_u32 s37, s37, 0
 ; CI-NEXT: v_mov_b32_e32 v0, 3
-; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; GCN-DAG: s_movk_i32 s32, 0x400
-; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
+; GCN: buffer_store_dword [[K]], off, s[0:3], 0
 ; Pass %arg31 on stack
 ; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
 ; GCN: buffer_store_dword [[K1:v[0-9]+]], off, s[0:3], s32{{$}}
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0
 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 49bf48a..0705d49 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -529,16 +529,16 @@ define void @too_many_args_use_workitem_id_x_byval(
 ; FIXEDABI: v_mov_b32_e32 v31, v0
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
 ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0{{$}}
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 ; FIXME: Why this reload?
-; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0{{$}}
 ; FIXEDABI-NOT: s32
-; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
+; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32
 ; FIXEDABI: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
 %alloca = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index f9b44f4..927e45f 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; GCN-LABEL: {{^}}store_fi_lifetime:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[FI]]
 define amdgpu_kernel void @store_fi_lifetime(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
@@ -14,7 +14,7 @@ entry:
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off,
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
@@ -27,16 +27,16 @@ define amdgpu_kernel void @stored_fi_to_lds(ptr addrspace(3) %ptr) #0 {
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
 ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]
-; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
 ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
 define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(ptr addrspace(3) %ptr) #0 {
 %tmp0 = alloca float, addrspace(5)
@@ -51,9 +51,9 @@ define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(ptr addrspace(3) %pt
 ; Same frame index is used multiple times in the store
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @stored_fi_to_self() #0 {
 %tmp = alloca ptr addrspace(5), addrspace(5)
@@ -65,13 +65,13 @@ define amdgpu_kernel void @stored_fi_to_self() #0 {
 ; GCN-LABEL: {{^}}stored_fi_to_self_offset:
 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
+; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2048{{$}}
-; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
+; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2048{{$}}
 define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
 %tmp0 = alloca [512 x i32], addrspace(5)
 %tmp1 = alloca ptr addrspace(5), addrspace(5)
@@ -86,15 +86,15 @@ define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
 }
 ; GCN-LABEL: {{^}}stored_fi_to_fi:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
-; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
-; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
+; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 define amdgpu_kernel void @stored_fi_to_fi() #0 {
 %tmp0 = alloca ptr addrspace(5), addrspace(5)
 %tmp1 = alloca ptr addrspace(5), addrspace(5)
@@ -104,14 +104,14 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 {
 store volatile ptr addrspace(5) inttoptr (i32 9999 to ptr addrspace(5)), ptr addrspace(5) %tmp2
- store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp2 ; store offset 4 at offset 8
- store volatile ptr addrspace(5) %tmp2, ptr addrspace(5) %tmp1 ; store offset 8 at offset 4
+ store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp2 ; store offset 0 at offset 4
+ store volatile ptr addrspace(5) %tmp2, ptr addrspace(5) %tmp1 ; store offset 4 at offset 0
 ret void
 }
 ; GCN-LABEL: {{^}}stored_fi_to_global:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[FI]]
 define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
 %tmp = alloca float, addrspace(5)
@@ -122,14 +122,14 @@ define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @stored_fi_to_global_2_small_objects(ptr addrspace(1) %ptr) #0 {
 %tmp0 = alloca float, addrspace(5)
@@ -178,7 +178,7 @@ define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(ptr addrspace(1
 ; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
 ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+12
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[FI]]
 define amdgpu_kernel void @cannot_select_assertzext_valuetype(ptr addrspace(1) %out, i32 %idx) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 42beb1c..7188883 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -30,7 +30,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
 ; GFX803-NEXT: s_add_u32 s0, s0, s7
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_endpgm
 ;
@@ -39,7 +39,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
 ; GFX900-NEXT: s_add_u32 s0, s0, s7
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_endpgm
 ;
@@ -48,14 +48,14 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_endpgm
 ;
 ; GFX1100-LABEL: test_kern_stack:
 ; GFX1100: ; %bb.0: ; %entry
 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
-; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v0, off dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_endpgm
 entry:
@@ -164,7 +164,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_getpc_b64 s[16:17]
 ; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -186,7 +186,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_getpc_b64 s[16:17]
 ; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -210,7 +210,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_getpc_b64 s[16:17]
 ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -229,7 +229,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1100-NEXT: s_mov_b32 s13, s14
 ; GFX1100-NEXT: s_mov_b32 s14, s15
 ; GFX1100-NEXT: s_mov_b32 s32, 16
-; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v1, off dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_getpc_b64 s[6:7]
 ; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
@@ -276,7 +276,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX803-NEXT: s_mov_b32 s33, 0
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_endpgm
 ;
@@ -286,7 +286,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX900-NEXT: s_add_u32 s0, s0, s7
 ; GFX900-NEXT: s_mov_b32 s33, 0
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_endpgm
 ;
@@ -296,7 +296,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
 ; GFX1010-NEXT: s_mov_b32 s33, 0
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_endpgm
 ;
@@ -304,7 +304,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX1100: ; %bb.0: ; %entry
 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1100-NEXT: s_mov_b32 s33, 0
-; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v0, s33 dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_endpgm
 entry:
@@ -436,7 +436,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_getpc_b64 s[16:17]
 ; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -459,7 +459,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_getpc_b64 s[16:17]
 ; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -484,7 +484,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-NEXT: s_getpc_b64 s[16:17]
 ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -504,7 +504,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1100-NEXT: s_mov_b32 s13, s14
 ; GFX1100-NEXT: s_mov_b32 s14, s15
 ; GFX1100-NEXT: s_mov_b32 s32, 16
-; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v1, s33 dlc
 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT: s_getpc_b64 s[6:7]
 ; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index cd36f6a..5615919 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -48,13 +48,13 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec
 ; GCN_DBG-NEXT: s_mov_b64 exec, -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -62,7 +62,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -87,13 +87,13 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -151,13 +151,13 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_branch .LBB1_2
 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -165,7 +165,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -190,7 +190,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1
 ; GCN_DBG-NEXT: s_branch .LBB1_2
@@ -239,13 +239,13 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_branch .LBB2_2
 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -253,7 +253,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -278,7 +278,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1
 ; GCN_DBG-NEXT: s_branch .LBB2_2
@@ -328,13 +328,13 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_branch .LBB3_2
 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -342,7 +342,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -365,7 +365,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1
 ; GCN_DBG-NEXT: s_branch .LBB3_2
@@ -441,13 +441,13 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: s_branch .LBB4_2
 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
 ; GCN_DBG-NEXT: s_endpgm
@@ -455,7 +455,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3
@@ -481,7 +481,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1
 ; GCN_DBG-NEXT: s_branch .LBB4_2
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index afb7357..49f9f69 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -136,8 +136,8 @@ done:
 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
-; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088 glc{{$}}
 ; GCN: {{^}}.LBB4_2:
 define amdgpu_kernel void @test_sink_scratch_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
 entry:
@@ -166,7 +166,8 @@ done:
 ret void
 }
-; This ends up not fitting due to the reserved 4 bytes at offset 0
+; This used to be a special case when the scavenge slot was
+; fixed at offset 0.
 ; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
 ; OPT-NOT: getelementptr [512 x i32]
 ; OPT: br i1
@@ -174,10 +175,8 @@ done:
 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
 ; GCN: s_and_saveexec_b64
-; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
-; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
-; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
-; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092 glc{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
 ; GCN: {{^.LBB[0-9]+}}_2:
 define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index dfc8361..397efb1 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -432,22 +432,22 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0
 ; GFX900-NEXT: s_waitcnt vmcnt(1)
 ; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
 ; GFX900-NEXT: s_waitcnt vmcnt(1)
 ; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -464,19 +464,19 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:4
+; FLATSCR-NEXT: scratch_store_short off, v0, s4
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:6
+; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
 ; FLATSCR-NEXT: s_mov_b32 s0, 0
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:8
+; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4
-; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:6
+; FLATSCR-NEXT: scratch_load_dword v0, off, s0
+; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR-NEXT: s_endpgm
@@ -490,24 +490,24 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5]
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0
 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10_DEFAULT-NEXT: s_clause 0x1
-; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
+; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1)
 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX10_DEFAULT-NEXT: s_endpgm
@@ -524,21 +524,21 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1]
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:4
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:6
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:2
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:8
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: s_clause 0x1
-; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 offset:4
-; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:6
+; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0
+; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR_GFX10-NEXT: s_endpgm
@@ -550,19 +550,19 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:6 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:8 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6
+; GFX11-NEXT: scratch_load_b32 v0, off, off
+; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
 ; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 8bd60aa..6422bee 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -51,21 +51,21 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b32 s0, 1
 ; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -73,12 +73,12 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT: s_mov_b32 s0, 0
 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -98,7 +98,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -106,12 +106,12 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -130,7 +130,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: .LBB0_3: ; %Flow
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
@@ -139,7 +139,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -225,21 +225,21 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b32 s0, 1
 ; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -247,12 +247,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT: s_mov_b32 s0, 0
 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -272,7 +272,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -280,12 +280,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -305,7 +305,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: .LBB1_3: ; %Flow
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -315,7 +315,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4
@@ -323,7 +323,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT: v_readlane_b32 s1,
v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 @@ -341,7 +341,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 @@ -436,7 +436,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) @@ -445,7 +445,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 @@ -468,7 +468,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -476,9 +476,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: 
s_cbranch_execz .LBB2_2 @@ -496,7 +496,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_2: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 @@ -506,19 +506,19 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 @@ -538,12 +538,12 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_4: ; %bb.else ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_5: ; %Flow1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 @@ -571,7 +571,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 
2 @@ -684,11 +684,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1 ; GCN-O0-NEXT: s_mov_b32 s1, 0 @@ -707,9 +707,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3] ; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v6, v2 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 @@ -725,7 +725,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 @@ -733,7 +733,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_1: ; %Flow2 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 @@ -743,18 +743,18 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 ; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: 
buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -772,15 +772,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 ; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then ; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -797,11 +797,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: s_mov_b32 s2, s0 @@ -818,15 +818,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 ; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 ; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; 
GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -842,7 +842,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_6: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_7: ; %Flow1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 @@ -861,7 +861,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 @@ -938,34 +938,34 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 ; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 ; GCN-O0-NEXT: ; %bb.1: ; %bb.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 
offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 @@ -983,7 +983,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: .LBB4_2: ; %bb.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index 1d2e211..d94e75c8 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -699,7 +699,7 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp ; GCN-LABEL: {{^}}commute_frameindex: ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} ; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]] define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 6ef14d3..2b5a8d9 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -26,7 +26,7 @@ ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], [[CMP0]] ; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]] @@ -50,7 +50,7 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 @@ -99,7 +99,7 @@ endif: ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] @@ -123,7 +123,7 @@ endif: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword 
v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 @@ -176,11 +176,11 @@ end: ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] -; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 ; 4-byte Folded Spill ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] @@ -189,11 +189,11 @@ end: ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow -; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] ; 4-byte Folded Reload +; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1 diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll index 5fda337..3802dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} +; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}} define amdgpu_kernel void @load_i8_sext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -13,7 +13,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} +; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}} define amdgpu_kernel void @load_i8_zext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -24,7 +24,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} +; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}} define amdgpu_kernel void @load_i16_sext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i16, addrspace(5) @@ -35,7 +35,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, off, 
s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}} +; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}} define amdgpu_kernel void @load_i16_zext_private(ptr addrspace(1) %out) { entry: %tmp0 = alloca i16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 51a4db1..1ba7e70 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -410,6 +410,8 @@ entry: } ; GCN-LABEL: {{^}}bit4_extelt: +; FIXME: One v_mov_b32_e32 vN, 0 should suffice +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-DAG: buffer_store_byte [[ZERO]], diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index bcf6bda..57991d6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 4 +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 ; FLAT_SCR_OPT-NEXT: flat_store_dword v[0:1], v2 @@ -22,7 +22,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_ARCH-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[0:1], src_private_base -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 4 +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, s1 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 ; FLAT_SCR_ARCH-NEXT: flat_store_dword v[0:1], v2 @@ -43,7 +43,7 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 offset:4 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 ; FLAT_SCR_OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_OPT-NEXT: s_endpgm ; @@ -51,7 +51,7 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 offset:4 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_ARCH-NEXT: s_endpgm %alloca = alloca i32, addrspace(5) @@ -120,7 +120,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 ; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 0 ; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 @@ -221,7 +221,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 @@ -243,7 +243,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 ; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 0 ; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 @@ -344,7 +344,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 ; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 8284a77..0af57c6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 @@ -36,7 +36,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-GISEL-LABEL: soff1_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 @@ -58,7 +58,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -76,7 +76,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -95,7 +95,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 
scope:SCOPE_SYS @@ -110,7 +110,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -138,7 +138,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 @@ -159,7 +159,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-GISEL-LABEL: soff1_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -184,7 +184,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -221,7 +221,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -237,7 +237,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -265,7 +265,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; 
GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 @@ -286,7 +286,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-GISEL-LABEL: soff1_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -311,7 +311,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -330,7 +330,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -348,7 +348,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -364,7 +364,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -392,7 +392,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -414,7 +414,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-GISEL-LABEL: soff2_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 @@ -439,7 +439,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -458,7 +458,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -478,7 +478,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -495,7 +495,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -523,7 +523,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -544,7 +544,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-GISEL-LABEL: soff2_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -571,7 +571,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 @@ -591,7 +591,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -612,7 +612,7 @@ define amdgpu_kernel void 
@soff2_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -629,7 +629,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -657,7 +657,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -678,7 +678,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-GISEL-LABEL: soff2_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -705,7 +705,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 @@ -725,7 +725,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -746,7 +746,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -763,7 +763,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -791,7 +791,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 @@ -813,7 +813,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-GISEL-LABEL: soff4_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 @@ -838,7 +838,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -857,7 +857,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 @@ -877,7 +877,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -894,7 +894,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -922,7 +922,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 @@ -943,7 +943,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-GISEL-LABEL: soff4_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: 
v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -970,7 +970,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 @@ -990,7 +990,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -1028,7 +1028,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS @@ -1056,7 +1056,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-GISEL-LABEL: soff4_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1103,7 +1103,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; 
GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 @@ -1143,7 +1143,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 84e3879..687d845 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -23,10 +23,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_kernel: @@ -43,10 +43,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_kernel: @@ -59,10 +59,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off ; GFX11-NEXT: s_endpgm ; ; 
GFX12-LABEL: zero_init_kernel: @@ -75,10 +75,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: zero_init_kernel: @@ -98,10 +98,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_kernel: @@ -112,10 +112,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX940-NEXT: s_mov_b32 s3, s0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_kernel: @@ -137,10 +137,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_kernel: @@ -162,10 +162,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 
off, v[0:3], off offset:20 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_kernel: @@ -178,10 +178,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off ; GFX11-PAL-NEXT: s_endpgm ; ; GFX12-PAL-LABEL: zero_init_kernel: @@ -194,10 +194,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off ; GFX12-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) @@ -381,11 +381,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_add_i32 s1, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -402,8 +402,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_add_i32 s0, s0, 4 -; GFX10-NEXT: s_add_i32 s1, s1, 4 +; GFX10-NEXT: s_add_i32 s0, s0, 0 +; GFX10-NEXT: s_add_i32 s1, s1, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -418,8 +418,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_add_i32 s0, s0, 4 -; GFX11-NEXT: s_add_i32 s1, s1, 4 +; GFX11-NEXT: s_add_i32 s0, s0, 0 +; GFX11-NEXT: s_add_i32 s1, s1, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -434,8 +434,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -455,11 +455,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -471,11 +471,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 4 +; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 4 +; GFX940-NEXT: s_add_i32 s0, s0, 0 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -497,8 +497,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -513,8 +513,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -529,8 +529,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -552,13 +552,13 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 -; GFX9-NEXT: s_add_i32 s0, s0, 
4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -573,8 +573,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: s_add_i32 s1, s1, 4 -; GFX10-NEXT: s_add_i32 s0, s0, 4 +; GFX10-NEXT: s_add_i32 s1, s1, 0 +; GFX10-NEXT: s_add_i32 s0, s0, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc @@ -587,8 +587,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_add_i32 s0, s0, 4 -; GFX11-NEXT: s_add_i32 s1, s1, 4 +; GFX11-NEXT: s_add_i32 s0, s0, 0 +; GFX11-NEXT: s_add_i32 s1, s1, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -601,8 +601,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -621,11 +621,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -634,12 +634,12 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 4 +; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 4 +; GFX940-NEXT: s_add_i32 s0, s0, 0 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -659,8 +659,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -673,8 +673,8 @@ define 
amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4 -; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -687,8 +687,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0 +; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -710,11 +710,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -727,8 +727,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -739,8 +739,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -750,8 +750,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -763,9 +763,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-PAL-NEXT: 
v_mov_b32_e32 v2, 15 -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 4, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 @@ -780,9 +780,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0 +; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -800,8 +800,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -812,8 +812,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -823,8 +823,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 @@ -1073,10 +1073,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 
offset:288 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_small_offset_kernel: @@ -1085,7 +1085,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 @@ -1095,15 +1095,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_small_offset_kernel: ; GFX11: ; %bb.0: -; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1113,15 +1113,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: zero_init_small_offset_kernel: ; GFX12: ; %bb.0: -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1131,10 +1131,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:288 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: zero_init_small_offset_kernel: @@ -1147,7 +1147,7 @@ define amdgpu_kernel void 
@zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 @@ -1156,15 +1156,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_small_offset_kernel: ; GFX940: ; %bb.0: -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_mov_b32 s0, 0 ; GFX940-NEXT: s_mov_b32 s1, s0 @@ -1172,10 +1172,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX940-NEXT: s_mov_b32 s3, s0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 sc0 sc1 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 @@ -1199,10 +1199,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 -; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, 
v[0:3], s0 offset:304 ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 @@ -1226,15 +1226,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 -; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_small_offset_kernel: ; GFX11-PAL: ; %bb.0: -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1244,15 +1244,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288 +; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX11-PAL-NEXT: s_endpgm ; ; GFX12-PAL-LABEL: zero_init_small_offset_kernel: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1262,10 +1262,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], 
off offset:288 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304 ; GFX12-PAL-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) @@ -1470,16 +1470,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_addk_i32 s1, 0x104 +; GFX9-NEXT: s_addk_i32 s1, 0x100 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1491,15 +1491,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_addk_i32 s0, 0x104 -; GFX10-NEXT: s_addk_i32 s1, 0x104 +; GFX10-NEXT: s_addk_i32 s0, 0x100 +; GFX10-NEXT: s_addk_i32 s1, 0x100 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1509,15 +1509,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_addk_i32 s0, 0x104 -; GFX11-NEXT: s_addk_i32 s1, 0x104 +; GFX11-NEXT: s_addk_i32 s0, 0x100 +; GFX11-NEXT: s_addk_i32 s1, 0x100 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1527,15 +1527,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; 
GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1553,16 +1553,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -1570,17 +1570,17 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x104 +; GFX940-NEXT: s_addk_i32 s1, 0x100 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x104 +; GFX940-NEXT: s_addk_i32 s0, 0x100 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -1598,15 +1598,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1625,15 +1625,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 
+; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1643,15 +1643,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX11-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1661,15 +1661,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1694,16 +1694,16 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_addk_i32 s0, 0x104 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1714,14 +1714,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: s_addk_i32 s1, 0x104 -; GFX10-NEXT: s_addk_i32 s0, 0x104 +; GFX10-NEXT: s_addk_i32 s1, 0x100 +; 
GFX10-NEXT: s_addk_i32 s0, 0x100 ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc @@ -1730,14 +1730,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_foo: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_addk_i32 s0, 0x104 -; GFX11-NEXT: s_addk_i32 s1, 0x104 +; GFX11-NEXT: s_addk_i32 s0, 0x100 +; GFX11-NEXT: s_addk_i32 s1, 0x100 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1746,14 +1746,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1770,32 +1770,32 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 +; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_foo: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x104 +; GFX940-NEXT: s_addk_i32 s1, 0x100 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x104 +; GFX940-NEXT: s_addk_i32 s0, 0x100 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -1812,14 +1812,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s3 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1837,14 +1837,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -1853,14 +1853,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 -; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX11-PAL-NEXT: s_addk_i32 s1, 0x100 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -1869,14 +1869,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x100 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1901,14 +1901,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 
offset:4 glc +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x100, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1921,10 +1921,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc +; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0 ; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -1934,10 +1934,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1946,10 +1946,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1967,25 +1967,25 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x100, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: 
store_load_vindex_small_offset_kernel:
 ; GFX940: ; %bb.0: ; %bb
-; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
-; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1
+; GFX940-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0
+; GFX940-NEXT: v_sub_u32_e32 v0, 0x100, v0
 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: s_endpgm
@@ -2004,10 +2004,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc
+; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 glc dlc
 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0
+; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0
 ; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -2027,10 +2027,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
-; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc
 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0
+; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0
 ; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -2040,10 +2040,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11-PAL: ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
+; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc
 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc
+; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc
 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -2052,10 +2052,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
@@ -2305,7 +2305,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ;
 ; GFX12-LABEL: zero_init_large_offset_kernel:
 ; GFX12: ; %bb.0:
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_mov_b32 s0, 0
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16388
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16404
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16420
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16384
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16400
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16416
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16432
 ; GFX12-NEXT: s_endpgm
 ;
 ; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2441,7 +2441,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ;
 ; GFX12-PAL-LABEL: zero_init_large_offset_kernel:
 ; GFX12-PAL: ; %bb.0:
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2451,10 +2451,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT: s_clause 0x3
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16388
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16404
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16420
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16384
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16400
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16416
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16432
 ; GFX12-PAL-NEXT: s_endpgm
 %padding = alloca [4096 x i32], align 4, addrspace(5)
 %alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -2768,15 +2768,15 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -2902,15 +2902,15 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -2987,14 +2987,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-LABEL: store_load_sindex_large_offset_foo:
 ; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -3110,14 +3110,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo:
 ; GFX12-PAL: ; %bb.0: ; %bb
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000
 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -3188,10 +3188,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3296,10 +3296,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
@@ -3537,11 +3537,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-LABEL: store_load_large_imm_offset_kernel:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -3642,11 +3642,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel:
 ; GFX12-PAL: ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: s_endpgm
 bb:
@@ -3812,7 +3812,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
@@ -3834,7 +3834,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
@@ -3847,7 +3847,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc
@@ -3860,9 +3860,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-NEXT: v_mov_b32_e32 v1, 15
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -3871,7 +3871,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-PAL-NEXT: s_mov_b32 s4, s0
 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
@@ -3889,7 +3889,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX940-LABEL: store_load_vidx_sidx_offset:
 ; GFX940: ; %bb.0: ; %bb
 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-NEXT: v_mov_b32_e32 v1, 0
 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -3915,7 +3915,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
@@ -3928,7 +3928,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0
 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc
@@ -3941,9 +3941,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15
 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: s_endpgm
 bb:
@@ -4732,11 +4732,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800
 ; GFX12-NEXT: ;;#ASMSTART
 ; GFX12-NEXT: ; use v0
 ; GFX12-NEXT: ;;#ASMEND
@@ -4850,11 +4850,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800
 ; GFX12-PAL-NEXT: ;;#ASMSTART
 ; GFX12-PAL-NEXT: ; use v0
 ; GFX12-PAL-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir b/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
index d7a1b2d..17ec6f5 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
@@ -22,7 +22,7 @@ body: |
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{ $}}
     ; GFX11-NEXT: renamable $vgpr0 = V_MOV_B32_e32 123, implicit $exec
-    ; GFX11-NEXT: renamable $vgpr0 = SCRATCH_LOAD_SHORT_D16_HI_ST 4, 0, killed renamable $vgpr0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: renamable $vgpr0 = SCRATCH_LOAD_SHORT_D16_HI_ST 0, 0, killed renamable $vgpr0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr0_sgpr1, 4, 0
     ; GFX11-NEXT: renamable $sgpr0 = S_LSHL_B32 killed renamable $sgpr0, 1, implicit-def dead $scc
     ; GFX11-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 028f328..eeddc22 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -310,7 +310,7 @@ ret:
 ; GFX11-LABEL: tied_operand_test:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
+; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
 ; GFX11-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
 ; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[LDRESULT]] offset:10
 ; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6eec8d5..9c7ce39 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1227,9 +1227,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1271,9 +1271,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -1431,9 +1431,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1475,9 +1475,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2457,9 +2457,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2501,9 +2501,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -2661,9 +2661,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2705,9 +2705,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4355,9 +4355,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4399,9 +4399,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -4559,9 +4559,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4603,9 +4603,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index c927a0e..11d35c5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1331,9 +1331,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1375,9 +1375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -1535,9 +1535,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1579,9 +1579,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2561,9 +2561,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2605,9 +2605,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -2765,9 +2765,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2809,9 +2809,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4563,9 +4563,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4607,9 +4607,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1132-NEXT: s_mov_b32 s2, 0
 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-NEXT: ; %bb.1:
@@ -4767,9 +4767,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4811,9 +4811,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 5882043..b9269e2 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -7,12 +7,12 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 65535
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -20,7 +20,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; SCRATCH128K-NOT: v_and_b32
 ; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
 ; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
@@ -28,7 +28,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 131071
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -36,7 +36,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; SCRATCH128K-NOT: v_and_b32
 ; SCRATCH256K-NOT: v_and_b32
 ; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
@@ -44,7 +44,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 262143
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -52,7 +52,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; SCRATCH128K-NOT: v_and_b32
 ; SCRATCH256K-NOT: v_and_b32
 ; SCRATCH1024K-NOT: v_and_b32
@@ -60,7 +60,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 1048575
   store volatile i32 %masked, ptr addrspace(1) undef
@@ -68,12 +68,12 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
 }
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; GCN-NOT: v_and_b32
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 15, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 2097151
   store volatile i32 %masked, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 823c444..f736ca7 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -969,7 +969,7 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT: s_add_u32 s4, s4, s3
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT: s_addc_u32 s5, s5, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 4
+; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s3, s3, 3
 ; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -980,16 +980,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3
 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:7
-; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:6
-; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:5
+; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0
+; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3
+; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2
+; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1
 ; GCN-NEXT: v_mov_b32_e32 v1, 1
 ; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen
-; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:5
-; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6
-; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:7
+; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
+; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1
+; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3
 ; GCN-NEXT: s_waitcnt vmcnt(3)
 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
index 5873e9c..6f61179 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -4,7 +4,7 @@
 ; alignment of the stack
 ; CHECK-LABEL: {{^}}no_args:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @no_args() {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -12,7 +12,7 @@ define amdgpu_kernel void @no_args() {
 }
 ; CHECK-LABEL: {{^}}force_align32:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align32(<8 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -20,7 +20,7 @@ define amdgpu_kernel void @force_align32(<8 x i32>) {
 }
 ; CHECK-LABEL: {{^}}force_align64:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align64(<16 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -28,7 +28,7 @@ define amdgpu_kernel void @force_align64(<16 x i32>) {
 }
 ; CHECK-LABEL: {{^}}force_align128:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align128(<32 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
@@ -36,7 +36,7 @@ define amdgpu_kernel void @force_align128(<32 x i32>) {
 }
 ; CHECK-LABEL: {{^}}force_align256:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
 define amdgpu_kernel void @force_align256(<64 x i32>) {
   %alloca = alloca i8, addrspace(5)
   store volatile i8 0, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index 13a8033..a209dcf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -444,7 +444,7 @@ main_body:
 ; for stack access.
 ; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
-; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
 ; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
 define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
   %alloca = alloca i32, addrspace(5)
@@ -455,7 +455,7 @@ define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
 }
 ; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
-; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 0{{$}}
 ; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
 ; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]]
 define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
index e9d9b66..8598b78 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
@@ -3,7 +3,7 @@
 ; FIXME: Requires stack object to not assert
 ; GCN-LABEL: {{^}}test_ps:
 ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GCN: buffer_store_dword v0, off, s[4:7], 0{{$}}
 ; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: ; return
@@ -17,7 +17,7 @@ define amdgpu_ps i32 @test_ps() #1 {
 ; GCN-LABEL: {{^}}test_cs:
 ; GCN: s_mov_b64 s[4:5], s[0:1]
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0{{$}}
 ; GCN: s_load_dword s0, s[0:1], 0x0
 define amdgpu_cs i32 @test_cs() #1 {
   %alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index e789db1..0284f44 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -58,7 +58,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %o
 ; Make sure this doesn't crash.
 ; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
 define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint ptr addrspace(5) %alloca to i32
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index b288535..21e27bf 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3478,19 +3478,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8
@@ -3562,17 +3562,17 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT: s_endpgm
@@ -3801,20 +3801,20 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
@@ -3885,16 +3885,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -4289,11 +4289,11 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16
@@ -4370,10 +4370,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT: s_endpgm
@@ -4602,20 +4602,20 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16
@@ -4686,16 +4686,16 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -7270,11 +7270,11 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v3
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v15
@@ -7334,16 +7334,16 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v39
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39
@@ -7363,10 +7363,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index af96165..0f9cc33 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3031,11 +3031,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
 ; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
 ; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
-; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
@@ -3090,10 +3090,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
-; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
@@ -3611,11 +3611,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
@@ -3654,11 +3654,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; 4-byte Folded Reload
 ; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:8 ; 4-byte Folded
Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index ee2c590..940287d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -390,7 +390,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_brev_b32 s0, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -401,7 +401,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 +; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -421,11 +421,11 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 -; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4 +; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-SCRATCH-NEXT: ;;#ASMSTART ; GCN-SCRATCH-NEXT: ;;#ASMEND -; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4 +; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11 @@ -460,15 +460,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; GCN-NEXT: s_add_u32 s4, s4, s3 ; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:8 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -482,15 +482,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 -; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4 +; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:8 +; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:4 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-SCRATCH-NEXT: 
;;#ASMSTART ; GCN-SCRATCH-NEXT: ;;#ASMEND ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off offset:4 -; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:8 +; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off +; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:4 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SCRATCH-NEXT: exp mrt0 v0, off, off, off done vm diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 301f971..4ba5f3a 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -137,11 +137,11 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { ; so a possibly negative base index can't be used for the vgpr offset. ; GCN-LABEL: {{^}}store_private_unknown_bits_vaddr: -; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 4 +; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 0 ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR1:v[0-9]+]], vcc, 32, [[ADDR0]] ; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} -; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4, +; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 0, ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 75da11b..45fbaaa 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) @@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit 
$mode, implicit $exec - ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -86,7 +86,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -94,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll 
b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index d6d559b..d898a13 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -110,7 +110,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_writelane_b32 v2, s10, 62 ; GCN-NEXT: v_writelane_b32 v2, s11, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] @@ -201,7 +201,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_writelane_b32 v1, s10, 62 ; GCN-NEXT: v_writelane_b32 v1, s11, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] @@ -215,7 +215,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_writelane_b32 v0, s10, 6 ; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -223,10 +223,10 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_readlane_b32 s8, v2, 56 @@ -319,7 +319,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_readlane_b32 s6, v1, 6 ; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] @@ -423,13 +423,13 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword 
v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ; kill: killed $vgpr2 ; GCN-NEXT: ; kill: killed $vgpr1 @@ -570,7 +570,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v1, s18, 62 ; GCN-NEXT: v_writelane_b32 v1, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] @@ -589,7 +589,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v0, s2, 8 ; GCN-NEXT: v_writelane_b32 v0, s3, 9 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -597,10 +597,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 @@ -698,10 +698,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 @@ -747,10 +747,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -840,7 +840,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v1, s18, 62 ; GCN-NEXT: v_writelane_b32 v1, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: 
buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] @@ -849,7 +849,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +857,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s36, v1, 32 @@ -909,7 +909,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_readlane_b32 s30, v1, 14 ; GCN-NEXT: v_readlane_b32 s31, v1, 15 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] @@ -947,10 +947,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 @@ -999,10 +999,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -1092,7 +1092,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_writelane_b32 v1, s18, 62 ; GCN-NEXT: v_writelane_b32 v1, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] @@ -1101,7 +1101,7 @@ define 
amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1109,7 +1109,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s36, v2, 32 @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_readlane_b32 s30, v2, 14 ; GCN-NEXT: v_readlane_b32 s31, v2, 15 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 @@ -1205,10 +1205,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir index d349339..8e2a56b 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir @@ -36,12 +36,12 @@ body: | ; GCN-LABEL: name: preserve_active_lanes_above_args ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable 
$sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec $vgpr8 = COPY renamable killed $vgpr10 @@ -70,8 +70,8 @@ body: | ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 @@ -81,8 +81,8 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 @@ -142,8 +142,8 @@ body: | ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 @@ -152,8 +152,8 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 
0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir index ae920f9..765597fe 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir @@ -38,14 +38,14 @@ body: | ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc @@ -72,8 +72,8 @@ body: | ; GCN: 
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 @@ -86,8 +86,8 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir index 23c6afd..e4cbae6 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir @@ -39,7 +39,7 @@ body: | ; GFX908-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr7, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec @@ -148,74 +148,74 @@ body: | ; GFX908-NEXT: $vgpr35 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX908-NEXT: 
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr35, implicit $exec, implicit $exec ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr8, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr10, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr12, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr13, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr16, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr17, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr20, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr21, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr22, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec 
:: (store (s32) into %stack.18, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr23, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr24, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr25, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr26, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr27, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr28, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr29, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr30, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr31, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET 
killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr34, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr35, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr36, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr37, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr38, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr39, implicit $exec, implicit $exec - ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5) + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5) ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr40, implicit $exec, implicit $exec ; GFX908-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, implicit $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, implicit $vgpr35 - ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, 
implicit $exec, implicit $exec ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec @@ -287,39 +287,39 @@ body: | ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec, implicit $exec ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec - ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) - ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) - ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) - ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) - ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) - ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) - ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5) - ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5) - ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) - ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) - ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5) - ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5) - ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5) - ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5) - ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5) - ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5) - ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) - ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (load 
- ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5)
- ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5)
- ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5)
- ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (load (s32) from %stack.25, addrspace 5)
- ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5)
- ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5)
- ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5)
- ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5)
- ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5)
- ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5)
- ; GFX908-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5)
- ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5)
+ ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5)
+ ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5)
+ ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (load (s32) from %stack.21, addrspace 5)
+ ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5)
+ ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5)
+ ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5)
+ ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (load (s32) from %stack.25, addrspace 5)
+ ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5)
+ ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5)
+ ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5)
+ ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5)
+ ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5)
+ ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5)
+ ; GFX908-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5)
+ ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5)
 ; GFX908-NEXT: S_NOP 0, implicit renamable $agpr0, implicit killed renamable $vgpr1, implicit killed renamable $vgpr2, implicit killed renamable $vgpr3, implicit killed renamable $vgpr4, implicit killed renamable $vgpr5, implicit killed renamable $vgpr6, implicit killed renamable $vgpr7, implicit killed renamable $vgpr8, implicit killed renamable $vgpr9, implicit killed renamable $vgpr10, implicit killed renamable $vgpr11, implicit killed renamable $vgpr12, implicit killed renamable $vgpr13, implicit killed renamable $vgpr14, implicit killed renamable $vgpr15, implicit killed renamable $vgpr16, implicit killed renamable $vgpr17, implicit killed renamable $vgpr18, implicit killed renamable $vgpr19, implicit killed renamable $vgpr20, implicit killed renamable $vgpr21, implicit killed renamable $vgpr22, implicit killed renamable $vgpr23, implicit killed renamable $vgpr24, implicit killed renamable $vgpr25, implicit killed renamable $vgpr26, implicit killed renamable $vgpr27, implicit killed renamable $vgpr28, implicit killed renamable $vgpr29, implicit killed renamable $vgpr30, implicit killed renamable $vgpr31, implicit killed renamable $vgpr32, implicit killed renamable $vgpr33, implicit killed renamable $vgpr34
 ; GFX908-NEXT: S_ENDPGM 0, implicit killed renamable $agpr0
 %v0:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 272daac..4cc469b 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -73,14 +73,14 @@
 ; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
 ; GFX11-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
-; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
-; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
-; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
+; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x200, [[CLAMP_IDX]]
+; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]
+; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]
 ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
-; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off offset:128
+; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off{{$}}
 define amdgpu_ps float @ps_main(i32 %idx) {
 %v1 = extractelement <81 x float> , i32 %idx
 %v2 = extractelement <81 x float> , i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 573fa7ac..242ecd8 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -15,10 +15,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
@@ -106,7 +106,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: v_writelane_b32 v1, s22, 62
 ; GCN-NEXT: v_writelane_b32 v1, s23, 63
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; def s[6:7]
@@ -115,7 +115,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: v_writelane_b32 v0, s6, 0
 ; GCN-NEXT: v_writelane_b32 v0, s7, 1
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_mov_b32 s5, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -123,7 +123,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT: ; %bb.1: ; %bb0
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_readlane_b32 s4, v1, 0
@@ -143,7 +143,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: v_readlane_b32 s18, v1, 14
 ; GCN-NEXT: v_readlane_b32 s19, v1, 15
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; use s[4:19]
@@ -213,10 +213,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: .LBB0_2: ; %ret
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[24:25]
 ; GCN-NEXT: ; kill: killed $vgpr1
 ; GCN-NEXT: ; kill: killed $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index 059eb6d..f19b0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -68,74 +68,74 @@ body: |
 ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 28, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -145,12 +145,12 @@ body: |
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -168,12 +168,12 @@ body: |
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 96, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -207,16 +207,16 @@ body: |
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-MUBUF-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 262144, implicit-def dead $scc
 ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, killed $sgpr2, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ;
 ; GCN32-MUBUF-LABEL: name: check_spill
@@ -232,74 +232,74 @@ body: |
 ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 28, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -309,12 +309,12 @@ body: |
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -332,12 +332,12 @@ body: |
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 96, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -371,16 +371,16 @@ body: |
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN32-MUBUF-NEXT: $sgpr1 = S_ADD_I32 $sgpr33, 131072, implicit-def dead $scc
 ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, killed $sgpr1, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
 ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
 ;
 ; GCN64-FLATSCR-LABEL: name: check_spill
@@ -392,74 +392,74 @@ body: |
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -469,12 +469,12 @@ body: |
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -492,12 +492,12 @@ body: |
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 96, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -531,16 +531,16 @@ body: |
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
 ; GCN64-FLATSCR-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 4096, implicit-def dead $scc
 ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, killed $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
 ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
 renamable $sgpr12 = IMPLICIT_DEF
 SI_SPILL_S32_SAVE killed $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
@@ -626,52 +626,52 @@ body: |
 ;
GCN64-MUBUF-NEXT: $sgpr29 = S_ADDC_U32 $sgpr29, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 28, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3 ; GCN64-MUBUF-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -681,11 +681,11 @@ body: | ; GCN64-MUBUF-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5 ; GCN64-MUBUF-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6 ; GCN64-MUBUF-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 96, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -703,11 +703,11 @@ body: | ; GCN64-MUBUF-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13 ; GCN64-MUBUF-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14 ; GCN64-MUBUF-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN64-MUBUF-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -741,15 +741,15 @@ body: | ; 
GCN64-MUBUF-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29 ; GCN64-MUBUF-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 30 ; GCN64-MUBUF-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 262144, implicit-def dead $scc ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, killed $sgpr2, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, align 4096, addrspace 5) ; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; ; GCN32-MUBUF-LABEL: name: check_reload @@ -764,52 +764,52 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, 
$sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 28, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN32-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; 
GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN32-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3 ; GCN32-MUBUF-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -819,11 +819,11 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5 ; GCN32-MUBUF-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6 ; GCN32-MUBUF-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 96, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR 
$vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -841,11 +841,11 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13 ; GCN32-MUBUF-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14 ; GCN32-MUBUF-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN32-MUBUF-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -879,15 +879,15 @@ body: | ; GCN32-MUBUF-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29 ; GCN32-MUBUF-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 30 ; GCN32-MUBUF-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0 - ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr1 = S_ADD_I32 $sgpr33, 131072, implicit-def dead $scc ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, killed $sgpr1, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, align 4096, addrspace 5) ; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0 ; ; GCN64-FLATSCR-LABEL: name: check_reload @@ -898,52 +898,52 @@ body: | ; GCN64-FLATSCR-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr 
:: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-FLATSCR-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 ; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2 ; GCN64-FLATSCR-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3 ; GCN64-FLATSCR-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64, 0, implicit $exec, 
implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -953,11 +953,11 @@ body: | ; GCN64-FLATSCR-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5 ; GCN64-FLATSCR-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6 ; GCN64-FLATSCR-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -975,11 +975,11 @@ body: | ; GCN64-FLATSCR-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13 ; GCN64-FLATSCR-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14 ; GCN64-FLATSCR-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN64-FLATSCR-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 @@ -1013,15 +1013,15 @@ body: | ; GCN64-FLATSCR-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29 ; GCN64-FLATSCR-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR 
$vgpr0, 30 ; GCN64-FLATSCR-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0 - ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 4096, implicit-def dead $scc ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.8, align 4096, addrspace 5) ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 - ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index e67b5e4..c5a5a52 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -8,7 +8,7 @@ ; Make sure we are handling hazards correctly. 
; SGPR: v_mov_b32_e32 v0, vcc_lo ; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload +; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 ; 4-byte Folded Reload ; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 5871a78..c9413b6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -75,31 +75,31 @@ use: ; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill ; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill ; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill ; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill +; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill ; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload +; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload ; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off -; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v5, 
off, s[4:7], 0 offset:16 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload +; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload ; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off ; GCN: ScratchSize: 20 diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 4a13a74..f192f25 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -17,7 +17,7 @@ ; TOVMEM: s_mov_b64 [[COPY_EXEC:s\[[0-9]+:[0-9]+\]]], exec ; TOVMEM: s_mov_b64 exec, 1 ; TOVMEM: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0 -; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Spill +; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Spill ; TOVMEM: s_mov_b64 exec, [[COPY_EXEC]] ; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]] @@ -26,7 +26,7 @@ ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], [[M0_LANE]] ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]] -; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Reload +; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) ; TOVMEM: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]], 0 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index 7ad3520..baca66a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -11,14 +11,14 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_endpgm ; @@ -27,16 +27,16 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc +; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xffc +; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: 
scratch_store_dword off, v0, s0 offset:8 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_endpgm entry: @@ -277,19 +277,19 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; v[0:1] @@ -301,16 +301,16 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 +; FLATSCR-NEXT: s_movk_i32 s0, 0xff4 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc +; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 +; FLATSCR-NEXT: s_movk_i32 s0, 0xff4 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 1458a93..bea2e6d 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10315,8 +10315,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s34, 0x84800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s34, 0x84800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10351,8 +10351,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: 
s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s34, 0x85000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s34, 0x85000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10387,8 +10387,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s34, 0x85800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s34, 0x85800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10431,8 +10431,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s36, 0x86000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s36, 0x86000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10449,8 +10449,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s44, 0x86800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s44, 0x86800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10463,8 +10463,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10494,8 +10494,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_or_b64 exec, exec, vcc ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x80400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s6, 0x80400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10509,8 +10509,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1] ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x80800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s6, 0x80800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir index 3892ceb..537aca1 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir @@ -50,28 +50,28 @@ body: | ; GFX9-NEXT: $vcc = 
IMPLICIT_DEF ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX9-NEXT: $vcc = IMPLICIT_DEF ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX9-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc ; GFX9-NEXT: $vcc_hi 
= SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; ; GFX10-LABEL: name: check_vcc @@ -87,28 +87,28 @@ body: | ; GFX10-NEXT: $vcc = IMPLICIT_DEF ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX10-NEXT: $vcc = IMPLICIT_DEF ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) - ; GFX10-NEXT: 
$vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX10-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc ; GFX10-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; ; GFX11-LABEL: name: check_vcc @@ -118,28 +118,28 @@ body: | ; GFX11-NEXT: $vcc = IMPLICIT_DEF ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX11-NEXT: $vcc = IMPLICIT_DEF ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc ; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: 
(load (s32) from %stack.1, addrspace 5) ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0 - ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5) - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; GFX11-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc ; GFX11-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1 - ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 $vcc = IMPLICIT_DEF SI_SPILL_S64_SAVE $vcc, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index 3720933..eb211f7 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -8,6 +8,9 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -56,6 +59,9 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -101,6 +107,8 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9-NEXT: .amdhsa_exception_int_div_zero 0 ; GFX9-NEXT: .end_amdhsa_kernel ; GFX9-NEXT: .text + %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca + store volatile i8 3, ptr addrspace(5) %clutter %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void @@ -111,6 +119,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -119,7 +130,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-NEXT: .p2align 6 ; VI-NEXT: .amdhsa_kernel stackrealign_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 -; VI-NEXT: .amdhsa_private_segment_fixed_size 8 +; VI-NEXT: 
.amdhsa_private_segment_fixed_size 12 ; VI-NEXT: .amdhsa_kernarg_size 0 ; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 @@ -159,6 +170,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -167,7 +181,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .p2align 6 ; GFX9-NEXT: .amdhsa_kernel stackrealign_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 -; GFX9-NEXT: .amdhsa_private_segment_fixed_size 8 +; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12 ; GFX9-NEXT: .amdhsa_kernarg_size 0 ; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 @@ -204,6 +218,8 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .amdhsa_exception_int_div_zero 0 ; GFX9-NEXT: .end_amdhsa_kernel ; GFX9-NEXT: .text + %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca + store volatile i8 3, ptr addrspace(5) %clutter %alloca.align = alloca i32, align 4, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 4 ret void @@ -214,6 +230,9 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -262,6 +281,9 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -307,6 +329,8 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9-NEXT: .amdhsa_exception_int_div_zero 0 ; GFX9-NEXT: .end_amdhsa_kernel ; GFX9-NEXT: .text + %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca + store volatile i8 3, ptr addrspace(5) %clutter %alloca.align = alloca i32, align 4, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll index b1a939d..e378a83 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll @@ -3,8 +3,8 @@ declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1 -; ERROR: error: <unknown>:0:0: stack frame size (131061) exceeds limit (131056) in function 'stack_size_limit_wave64' -; GCN: ; ScratchSize: 131061 +; ERROR: error: <unknown>:0:0: stack frame size (131064) exceeds limit (131056) in function 'stack_size_limit_wave64' +; GCN: ; ScratchSize: 131064 define amdgpu_kernel void @stack_size_limit_wave64() #0 { entry: %alloca = alloca [131057 x i8], align 1, addrspace(5) @@ -12,8 +12,8 @@ entry: ret void } -; ERROR: error: <unknown>:0:0: stack frame size (262117) exceeds limit (262112) in function 'stack_size_limit_wave32' -; GCN: ; ScratchSize: 262117 +; ERROR: error: <unknown>:0:0: stack frame
size (262120) exceeds limit (262112) in function 'stack_size_limit_wave32' +; GCN: ; ScratchSize: 262120 define amdgpu_kernel void @stack_size_limit_wave32() #1 { entry: %alloca = alloca [262113 x i8], align 1, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index d8db2d5..8c5b894 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -878,7 +878,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-OPT-NEXT: s_lshr_b32 s6, s0, 5 ; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] ; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] -; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 ; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -904,7 +904,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-OPT-NEXT: s_lshr_b32 s6, s0, 6 ; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] ; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] -; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 ; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -935,10 +935,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23] @@ -1020,7 +1020,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:132 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1 @@ -1053,10 +1053,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_lshr_b32 s0, s0, 6 ; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill +; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21] ; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:4 +; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 ; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; 
WAVE64-O0-NEXT: s_mov_b64 s[2:3], s[26:27] @@ -1138,7 +1138,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload +; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21] ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: v_readlane_b32 s1, v0, 1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll index 808f006..137bd0f 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -16,33 +16,33 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" { ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:160 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v0, off offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload ; 
CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 @@ -83,16 +83,16 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:576 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2 ; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 ; CHECK-NEXT: s_cbranch_execz .LBB1_2 @@ -101,23 +101,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; CHECK-NEXT: 
s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS @@ -134,7 +138,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 @@ -142,10 +146,6 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload -; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS -; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr0 @@ -159,23 +159,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS @@ -192,7 +196,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; 
CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 @@ -200,17 +204,13 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload -; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS -; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: .LBB1_4: ; %.exit ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 7aaf945..0cabfa9 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -25,7 +25,7 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 -; GCN: ScratchSize: 768 +; GCN: ScratchSize: 640 define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 3a3860d..a1d3e2a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -16,12 +16,12 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_mov_b32_e32 v2, v0 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 @@ -32,7 +32,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: 
v_writelane_b32 v0, s4, 0 ; CHECK-NEXT: v_writelane_b32 v0, s5, 1 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; CHECK-NEXT: s_mov_b64 exec, s[4:5] @@ -40,20 +40,20 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: ; %bb.1: ; %bb193 ; CHECK-NEXT: .LBB0_2: ; %bb194 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4 ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; %bb201 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, V2@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, V2@rel32@hi+12 @@ -66,7 +66,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 78e8ab1..f78b408 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -433,416 +433,416 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:32 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, 
off, s[8:11], 0 offset:84 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte 
Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; 
GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5]
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT: s_cbranch_execz .LBB6_2
@@ -853,494 +853,494 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
 ; GFX906-NEXT: s_waitcnt vmcnt(3)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7]
 ; GFX906-NEXT: s_waitcnt vmcnt(1)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
 ; GFX906-NEXT: s_waitcnt vmcnt(12)
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
 ; GFX906-NEXT: .LBB6_2: ; %bb.2
 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
 ; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
 ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_waitcnt vmcnt(2)
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:772 ; 4-byte Folded
Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 ; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58 ; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1349,42 +1349,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54 ; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1393,42 +1393,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, 
v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50 ; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload -; GFX906-NEXT: 
buffer_load_dword v50, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1437,42 +1437,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46 ; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, 
v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1481,42 +1481,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42 ; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1525,42 +1525,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:80 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte 
Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38 ; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1569,42 +1569,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload +; GFX906-NEXT: 
buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1613,42 +1613,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: 
global_store_dwordx4 v4, v[0:3], s[2:3] offset:112 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 ; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1657,42 +1657,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 ; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1701,42 +1701,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:280 ; 4-byte 
Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22 ; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1745,42 +1745,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 
offset:244 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18 ; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1789,36 +1789,36 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload ; 
GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1826,27 +1826,27 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 
offset:160 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1854,9 +1854,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1864,8 +1864,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1874,21 +1874,21 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1896,9 +1896,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1906,8 +1906,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr 
addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1916,15 +1916,15 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: s_waitcnt vmcnt(3) @@ -1934,9 +1934,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 
offset:32 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1944,9 +1944,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1954,8 +1954,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 46ff9a9..95dfb12 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2035,9 +2035,9 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] @@ -2059,11 +2059,11 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 4 +; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 0 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: 
buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 6003d03..47c976d 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -124,7 +124,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 @@ -150,9 +150,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -165,22 +165,22 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -195,10 +195,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, 
s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 @@ -208,8 +208,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -349,7 +349,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 @@ -358,7 +358,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 @@ -388,7 +388,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 @@ -427,7 +427,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 
4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] @@ -584,7 +584,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 @@ -593,7 +593,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 @@ -624,7 +624,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -659,7 +659,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 @@ -670,7 +670,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 @@ -994,7 +994,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 @@ -1020,9 +1020,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -1035,22 +1035,22 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -1065,10 +1065,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB8_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 @@ -1078,8 +1078,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 ; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1219,7 +1219,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 @@ -1228,7 +1228,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 @@ -1258,7 +1258,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] @@ -1454,7 +1454,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 @@ -1463,7 +1463,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 @@ -1529,7 +1529,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index abb9806..2588d88 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 5 -; CHECK-NEXT: scavengeFI: '%fixed-stack.0' +; CHECK-NEXT: scavengeFI: '%stack.0' ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' @@ -303,7 +303,7 @@ ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 5 -; CHECK-NEXT: scavengeFI: '%fixed-stack.0' +; CHECK-NEXT: scavengeFI: '%stack.0' ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index f2144b8..9939366 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -38,7 +38,7 @@ ; AFTER-PEI-NEXT: fp64-fp16-output-denormals: true ; AFTER-PEI-NEXT: highBitsOf32BitAddress: 0 ; AFTER-PEI-NEXT: occupancy: 5 -; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0' +; AFTER-PEI-NEXT: scavengeFI: '%stack.3' ; AFTER-PEI-NEXT: vgprForAGPRCopy: '' ; AFTER-PEI-NEXT: sgprForEXECCopy: '' ; AFTER-PEI-NEXT: longBranchReservedReg: '' diff --git a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll index b3bdf96..b795ad1 100644 --- a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll +++ 
b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll @@ -36,15 +36,15 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) define amdgpu_kernel void @kernel1( ; CHECK: {{.*}}DW_TAG_formal_parameter -; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +4, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) +; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +0, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) ; CHECK-NEXT: DW_AT_name {{.*}}"ArgN" i32 %ArgN, ; CHECK: {{.*}}DW_TAG_formal_parameter -; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +8, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) +; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +4, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) ; CHECK-NEXT: DW_AT_name {{.*}}"ArgA" ptr addrspace(1) %ArgA, ; CHECK: {{.*}}DW_TAG_formal_parameter -; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +16, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) +; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +12, DW_OP_lit1, DW_OP_swap, DW_OP_xderef) ; CHECK-NEXT: DW_AT_name {{.*}}"ArgB" ptr addrspace(1) %ArgB) !dbg !13 { entry: -- cgit v1.1 From 0d9decc6694c188e2f7fa17d140ba9bd7cc98b6b Mon Sep 17 00:00:00 2001 From: Timm Bäder Date: Thu, 8 Feb 2024 22:38:28 +0100 Subject: [clang][Interp] Handle invalid CXXCtorInitializer expressions Their type might be a null type, in which case we need to abort here. --- clang/lib/AST/Interp/ByteCodeStmtGen.cpp | 4 ++++ clang/test/AST/Interp/records.cpp | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp index b0ec90a..bedcc78d 100644 --- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp @@ -144,6 +144,10 @@ bool ByteCodeStmtGen<Emitter>::visitFunc(const FunctionDecl *F) { auto emitFieldInitializer = [&](const Record::Field *F, unsigned FieldOffset, const Expr *InitExpr) -> bool { + // We don't know what to do with these, so just return false. + if (InitExpr->getType().isNull()) + return false; + if (std::optional<PrimType> T = this->classify(InitExpr)) { if (!this->visit(InitExpr)) return false; diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index fb50d1c..93da831 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1228,3 +1228,14 @@ namespace InheritedConstructor { constexpr S s(1); } } + +namespace InvalidCtorInitializer { + struct X { + int Y; + constexpr X() // expected-note {{declared here}} + : Y(fo_o_()) {} // both-error {{use of undeclared identifier 'fo_o_'}} + }; + // no crash on evaluating the constexpr ctor.
+ constexpr int Z = X().Y; // both-error {{constexpr variable 'Z' must be initialized by a constant expression}} \ + // expected-note {{undefined constructor 'X'}} +} -- cgit v1.1 From 173e674ba55eb93e8af43f2eece7feffe9954b34 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 00:19:52 -0800 Subject: [clang-format] Fix an out-of-bounds bug uncovered by 763139afc19d --- clang/unittests/Format/QualifierFixerTest.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 324366c..4e1768d 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1100,8 +1100,6 @@ TEST_F(QualifierFixerTest, IsQualifierType) { NotTokens[3], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( NotTokens[4], ConfiguredTokens)); - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[5], ConfiguredTokens)); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[0])); @@ -1113,8 +1111,6 @@ TEST_F(QualifierFixerTest, IsQualifierType) { LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[3])); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[4])); - EXPECT_FALSE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[5])); } TEST_F(QualifierFixerTest, IsMacro) { -- cgit v1.1 From b9079baaddfed5e604fbfaa1d81a7a1c38e78c26 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 9 Feb 2024 09:27:04 +0100 Subject: [NFC] clang-format utils/TableGen (#80973) ``` find llvm/utils/TableGen -iname "*.h" -o -iname "*.cpp" | xargs clang-format-16 -i ``` Split from #80847 --- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 593 +++++---- llvm/utils/TableGen/AsmWriterEmitter.cpp | 242 ++-- llvm/utils/TableGen/AsmWriterInst.cpp | 69 +- llvm/utils/TableGen/AsmWriterInst.h | 166 +-- llvm/utils/TableGen/CTagsEmitter.cpp | 7 +- llvm/utils/TableGen/CallingConvEmitter.cpp | 48 +- llvm/utils/TableGen/CodeEmitterGen.cpp | 41 +- llvm/utils/TableGen/CodeGenDAGPatterns.cpp | 579 ++++----- llvm/utils/TableGen/CodeGenDAGPatterns.h | 209 ++- llvm/utils/TableGen/CodeGenHwModes.cpp | 16 +- llvm/utils/TableGen/CodeGenHwModes.h | 70 +- llvm/utils/TableGen/CodeGenInstruction.cpp | 147 ++- llvm/utils/TableGen/CodeGenInstruction.h | 602 +++++---- llvm/utils/TableGen/CodeGenIntrinsics.h | 16 +- llvm/utils/TableGen/CodeGenMapTable.cpp | 132 +- llvm/utils/TableGen/CodeGenRegisters.cpp | 286 +++-- llvm/utils/TableGen/CodeGenRegisters.h | 1354 ++++++++++---------- llvm/utils/TableGen/CodeGenSchedule.cpp | 338 +++-- llvm/utils/TableGen/CodeGenSchedule.h | 49 +- llvm/utils/TableGen/CodeGenTarget.cpp | 62 +- llvm/utils/TableGen/CodeGenTarget.h | 36 +- llvm/utils/TableGen/DAGISelEmitter.cpp | 35 +- llvm/utils/TableGen/DAGISelMatcher.cpp | 74 +- llvm/utils/TableGen/DAGISelMatcher.h | 518 ++++---- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 167 ++- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 304 +++-- llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 5 +- llvm/utils/TableGen/DFAEmitter.cpp | 10 +- llvm/utils/TableGen/DFAPacketizerEmitter.cpp | 6 +- llvm/utils/TableGen/DXILEmitter.cpp | 25 +- llvm/utils/TableGen/DecoderEmitter.cpp | 136 +- llvm/utils/TableGen/DisassemblerEmitter.cpp | 4 +- llvm/utils/TableGen/FastISelEmitter.cpp | 138 +- llvm/utils/TableGen/InfoByHwMode.cpp | 63 +- llvm/utils/TableGen/InfoByHwMode.h | 41 +- llvm/utils/TableGen/InstrDocsEmitter.cpp 
| 26 +- llvm/utils/TableGen/InstrInfoEmitter.cpp | 178 ++- llvm/utils/TableGen/IntrinsicEmitter.cpp | 42 +- llvm/utils/TableGen/OptParserEmitter.cpp | 9 +- llvm/utils/TableGen/PredicateExpander.cpp | 4 +- llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 94 +- llvm/utils/TableGen/RegisterBankEmitter.cpp | 10 +- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 226 ++-- llvm/utils/TableGen/SDNodeProperties.h | 2 +- llvm/utils/TableGen/SearchableTableEmitter.cpp | 42 +- llvm/utils/TableGen/SequenceToOffsetTable.h | 12 +- llvm/utils/TableGen/SubtargetEmitter.cpp | 379 +++--- llvm/utils/TableGen/SubtargetFeatureInfo.cpp | 2 +- llvm/utils/TableGen/SubtargetFeatureInfo.h | 3 +- llvm/utils/TableGen/TableGenBackends.h | 2 - llvm/utils/TableGen/Types.cpp | 4 +- llvm/utils/TableGen/Types.h | 2 +- .../TableGen/WebAssemblyDisassemblerEmitter.cpp | 6 +- .../TableGen/X86CompressEVEXTablesEmitter.cpp | 11 +- llvm/utils/TableGen/X86DisassemblerShared.h | 4 +- llvm/utils/TableGen/X86DisassemblerTables.cpp | 21 +- llvm/utils/TableGen/X86DisassemblerTables.h | 52 +- llvm/utils/TableGen/X86ModRMFilters.cpp | 12 +- llvm/utils/TableGen/X86ModRMFilters.h | 29 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 2 +- llvm/utils/TableGen/X86RecognizableInstr.h | 2 +- 61 files changed, 3923 insertions(+), 3841 deletions(-) diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 011d96a..9065885 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -140,10 +140,11 @@ class AsmMatcherInfo; // RegisterSets can be seen in the outputted AsmMatcher tables occasionally, and // can even affect compiler output (at least seen in diagnostics produced when // all matches fail). So we use a type that sorts them consistently. -typedef std::set RegisterSet; +typedef std::set RegisterSet; class AsmMatcherEmitter { RecordKeeper &Records; + public: AsmMatcherEmitter(RecordKeeper &R) : Records(R) {} @@ -166,7 +167,7 @@ struct ClassInfo { /// The (first) user defined class, subsequent user defined classes are /// UserClass0+1, and so on. - UserClass0 = 1<<16 + UserClass0 = 1 << 16 }; /// Kind - The class kind, which is either a predefined kind, or (UserClass0 + @@ -176,7 +177,7 @@ struct ClassInfo { /// SuperClasses - The super classes of this class. Note that for simplicities /// sake user operands only record their immediate super class, while register /// operands include all superclasses. - std::vector SuperClasses; + std::vector SuperClasses; /// Name - The full class name, suitable for use in an enum. std::string Name; @@ -204,10 +205,12 @@ struct ClassInfo { /// For register classes: the records for all the registers in this class. RegisterSet Registers; - /// For custom match classes: the diagnostic kind for when the predicate fails. + /// For custom match classes: the diagnostic kind for when the predicate + /// fails. std::string DiagnosticType; - /// For custom match classes: the diagnostic string for when the predicate fails. + /// For custom match classes: the diagnostic string for when the predicate + /// fails. std::string DiagnosticString; /// Is this operand optional and not always required. @@ -224,9 +227,7 @@ public: } /// isUserClass() - Check if this is a user defined class. - bool isUserClass() const { - return Kind >= UserClass0; - } + bool isUserClass() const { return Kind >= UserClass0; } /// isRelatedTo - Check whether this class is "related" to \p RHS. Classes /// are related if they are in the same class hierarchy. 
@@ -244,8 +245,8 @@ public: RegisterSet Tmp; std::insert_iterator II(Tmp, Tmp.begin()); std::set_intersection(Registers.begin(), Registers.end(), - RHS.Registers.begin(), RHS.Registers.end(), - II, LessRecordByID()); + RHS.Registers.begin(), RHS.Registers.end(), II, + LessRecordByID()); return !Tmp.empty(); } @@ -469,7 +470,7 @@ struct MatchableInfo { unsigned SrcOperand2) { ResOperand X; X.Kind = TiedOperand; - X.TiedOperands = { TiedOperandNum, SrcOperand1, SrcOperand2 }; + X.TiedOperands = {TiedOperandNum, SrcOperand1, SrcOperand2}; X.MINumOperands = 1; return X; } @@ -503,7 +504,7 @@ struct MatchableInfo { Record *const TheDef; /// DefRec - This is the definition that it came from. - PointerUnion DefRec; + PointerUnion DefRec; const CodeGenInstruction *getResultInst() const { if (isa(DefRec)) @@ -542,16 +543,13 @@ struct MatchableInfo { bool UseInstAsmMatchConverter; MatchableInfo(const CodeGenInstruction &CGI) - : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef), DefRec(&CGI), - UseInstAsmMatchConverter(true) { - } + : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef), + DefRec(&CGI), UseInstAsmMatchConverter(true) {} MatchableInfo(std::unique_ptr Alias) - : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef), - DefRec(Alias.release()), - UseInstAsmMatchConverter( - TheDef->getValueAsBit("UseInstAsmMatchConverter")) { - } + : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef), + DefRec(Alias.release()), UseInstAsmMatchConverter(TheDef->getValueAsBit( + "UseInstAsmMatchConverter")) {} // Could remove this and the dtor if PointerUnion supported unique_ptr // elements with a dynamic failure/assertion (like the one below) in the case @@ -576,9 +574,8 @@ struct MatchableInfo { void formTwoOperandAlias(StringRef Constraint); void initialize(const AsmMatcherInfo &Info, - SmallPtrSetImpl &SingletonRegisters, - AsmVariantInfo const &Variant, - bool HasMnemonicFirst); + SmallPtrSetImpl &SingletonRegisters, + AsmVariantInfo const &Variant, bool HasMnemonicFirst); /// validate - Return true if this matchable is a valid thing to match against /// and perform a bunch of validity checking. @@ -603,9 +600,9 @@ struct MatchableInfo { } int findAsmOperandOriginallyNamed(StringRef N) const { - auto I = - find_if(AsmOperands, - [&](const AsmOperand &Op) { return Op.OrigSrcOpName == N; }); + auto I = find_if(AsmOperands, [&](const AsmOperand &Op) { + return Op.OrigSrcOpName == N; + }); return (I != AsmOperands.end()) ? I - AsmOperands.begin() : -1; } @@ -706,7 +703,7 @@ private: struct OperandMatchEntry { unsigned OperandMask; - const MatchableInfo* MI; + const MatchableInfo *MI; ClassInfo *CI; static OperandMatchEntry create(const MatchableInfo *mi, ClassInfo *ci, @@ -740,21 +737,21 @@ public: std::vector OperandMatchInfo; /// Map of Register records to their class information. - typedef std::map RegisterClassesTy; + typedef std::map RegisterClassesTy; RegisterClassesTy RegisterClasses; /// Map of Predicate records to their subtarget information. std::map SubtargetFeatures; /// Map of AsmOperandClass records to their class information. - std::map AsmOperandClasses; + std::map AsmOperandClasses; /// Map of RegisterClass records to their class information. - std::map RegisterClassClasses; + std::map RegisterClassClasses; private: /// Map of token to class information which has already been constructed. - std::map TokenClasses; + std::map TokenClasses; private: /// getTokenClass - Lookup or create the class for the given token. 
@@ -767,7 +764,7 @@ private: /// buildRegisterClasses - Build the ClassInfo* instances for register /// classes. - void buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters); + void buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters); /// buildOperandClasses - Build the ClassInfo* instances for user defined /// operand classes. @@ -779,8 +776,7 @@ private: MatchableInfo::AsmOperand &Op); public: - AsmMatcherInfo(Record *AsmParser, - CodeGenTarget &Target, + AsmMatcherInfo(Record *AsmParser, CodeGenTarget &Target, RecordKeeper &Records); /// Construct the various tables used during matching. @@ -798,9 +794,7 @@ public: return I == SubtargetFeatures.end() ? nullptr : &I->second; } - RecordKeeper &getRecords() const { - return Records; - } + RecordKeeper &getRecords() const { return Records; } bool hasOptionalOperands() const { return any_of(Classes, @@ -812,7 +806,8 @@ public: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void MatchableInfo::dump() const { - errs() << TheDef->getName() << " -- " << "flattened:\"" << AsmString <<"\"\n"; + errs() << TheDef->getName() << " -- " + << "flattened:\"" << AsmString << "\"\n"; errs() << " variant: " << AsmVariantID << "\n"; @@ -850,7 +845,7 @@ parseTwoOperandConstraint(StringRef S, ArrayRef Loc) { void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { // Figure out which operands are aliased and mark them as tied. std::pair Ops = - parseTwoOperandConstraint(Constraint, TheDef->getLoc()); + parseTwoOperandConstraint(Constraint, TheDef->getLoc()); // Find the AsmOperands that refer to the operands we're aliasing. int SrcAsmOperand = findAsmOperandNamed(Ops.first); @@ -858,11 +853,11 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { if (SrcAsmOperand == -1) PrintFatalError(TheDef->getLoc(), "unknown source two-operand alias operand '" + Ops.first + - "'."); + "'."); if (DstAsmOperand == -1) PrintFatalError(TheDef->getLoc(), "unknown destination two-operand alias operand '" + - Ops.second + "'."); + Ops.second + "'."); // Find the ResOperand that refers to the operand we're aliasing away // and update it to refer to the combined operand instead. @@ -878,7 +873,7 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { // Adjust the ResOperand references to any AsmOperands that followed // the one we just deleted. for (ResOperand &Op : ResOperands) { - switch(Op.Kind) { + switch (Op.Kind) { default: // Nothing to do for operands that don't reference AsmOperands. break; @@ -892,10 +887,9 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) { /// extractSingletonRegisterForAsmOperand - Extract singleton register, /// if present, from specified token. 
-static void -extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op, - const AsmMatcherInfo &Info, - StringRef RegisterPrefix) { +static void extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op, + const AsmMatcherInfo &Info, + StringRef RegisterPrefix) { StringRef Tok = Op.Token; // If this token is not an isolated token, i.e., it isn't separated from @@ -922,13 +916,12 @@ extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op, } void MatchableInfo::initialize(const AsmMatcherInfo &Info, - SmallPtrSetImpl &SingletonRegisters, + SmallPtrSetImpl &SingletonRegisters, AsmVariantInfo const &Variant, bool HasMnemonicFirst) { AsmVariantID = Variant.AsmVariantNo; - AsmString = - CodeGenInstruction::FlattenAsmStringVariants(AsmString, - Variant.AsmVariantNo); + AsmString = CodeGenInstruction::FlattenAsmStringVariants( + AsmString, Variant.AsmVariantNo); tokenizeAsmString(Info, Variant); @@ -936,7 +929,7 @@ void MatchableInfo::initialize(const AsmMatcherInfo &Info, // simple string, not a $foo variable or a singleton register. if (AsmOperands.empty()) PrintFatalError(TheDef->getLoc(), - "Instruction '" + TheDef->getName() + "' has no tokens"); + "Instruction '" + TheDef->getName() + "' has no tokens"); assert(!AsmOperands[0].Token.empty()); if (HasMnemonicFirst) { @@ -1045,7 +1038,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, size_t EndPos = String.find('}', i); assert(EndPos != StringRef::npos && "Missing brace in operand reference!"); - addAsmOperand(String.slice(i, EndPos+1), IsIsolatedToken); + addAsmOperand(String.slice(i, EndPos + 1), IsIsolatedToken); Prev = EndPos + 1; i = EndPos; IsIsolatedToken = false; @@ -1070,16 +1063,16 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const { // isCodeGenOnly if they are pseudo instructions. if (AsmString.find('\n') != std::string::npos) PrintFatalError(TheDef->getLoc(), - "multiline instruction is not valid for the asmparser, " - "mark it isCodeGenOnly"); + "multiline instruction is not valid for the asmparser, " + "mark it isCodeGenOnly"); // Remove comments from the asm string. We know that the asmstring only // has one line. if (!CommentDelimiter.empty() && StringRef(AsmString).contains(CommentDelimiter)) PrintFatalError(TheDef->getLoc(), - "asmstring for instruction has comment character in it, " - "mark it isCodeGenOnly"); + "asmstring for instruction has comment character in it, " + "mark it isCodeGenOnly"); // Reject matchables with operand modifiers, these aren't something we can // handle, the target should be refactored to use operands instead of @@ -1092,17 +1085,17 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const { for (const AsmOperand &Op : AsmOperands) { StringRef Tok = Op.Token; if (Tok[0] == '$' && Tok.contains(':')) - PrintFatalError(TheDef->getLoc(), - "matchable with operand modifier '" + Tok + - "' not supported by asm matcher. Mark isCodeGenOnly!"); + PrintFatalError( + TheDef->getLoc(), + "matchable with operand modifier '" + Tok + + "' not supported by asm matcher. Mark isCodeGenOnly!"); // Verify that any operand is only mentioned once. // We reject aliases and ignore instructions for now. 
if (!IsAlias && TheDef->getValueAsString("AsmMatchConverter").empty() && Tok[0] == '$' && !OperandNames.insert(std::string(Tok)).second) { LLVM_DEBUG({ errs() << "warning: '" << TheDef->getName() << "': " - << "ignoring instruction with tied operand '" - << Tok << "'\n"; + << "ignoring instruction with tied operand '" << Tok << "'\n"; }); return false; } @@ -1116,15 +1109,33 @@ static std::string getEnumNameForToken(StringRef Str) { for (char C : Str) { switch (C) { - case '*': Res += "_STAR_"; break; - case '%': Res += "_PCT_"; break; - case ':': Res += "_COLON_"; break; - case '!': Res += "_EXCLAIM_"; break; - case '.': Res += "_DOT_"; break; - case '<': Res += "_LT_"; break; - case '>': Res += "_GT_"; break; - case '-': Res += "_MINUS_"; break; - case '#': Res += "_HASH_"; break; + case '*': + Res += "_STAR_"; + break; + case '%': + Res += "_PCT_"; + break; + case ':': + Res += "_COLON_"; + break; + case '!': + Res += "_EXCLAIM_"; + break; + case '.': + Res += "_DOT_"; + break; + case '<': + Res += "_LT_"; + break; + case '>': + Res += "_GT_"; + break; + case '-': + Res += "_MINUS_"; + break; + case '#': + Res += "_HASH_"; + break; default: if (isAlnum(C)) Res += C; @@ -1166,8 +1177,7 @@ AsmMatcherInfo::getOperandClass(const CGIOperandList::OperandInfo &OI, return getOperandClass(Rec, SubOpIdx); } -ClassInfo * -AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { +ClassInfo *AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { if (Rec->isSubClassOf("RegisterOperand")) { // RegisterOperand may have an associated ParserMatchClass. If it does, // use it, else just fall back to the underlying register class. @@ -1177,7 +1187,7 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { "Record `" + Rec->getName() + "' does not have a ParserMatchClass!\n"); - if (DefInit *DI= dyn_cast(R->getValue())) { + if (DefInit *DI = dyn_cast(R->getValue())) { Record *MatchClass = DI->getDef(); if (ClassInfo *CI = AsmOperandClasses[MatchClass]) return CI; @@ -1186,8 +1196,9 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { // No custom match class. Just use the register class. Record *ClassRec = Rec->getValueAsDef("RegClass"); if (!ClassRec) - PrintFatalError(Rec->getLoc(), "RegisterOperand `" + Rec->getName() + - "' has no associated register class!\n"); + PrintFatalError(Rec->getLoc(), + "RegisterOperand `" + Rec->getName() + + "' has no associated register class!\n"); if (ClassInfo *CI = RegisterClassClasses[ClassRec]) return CI; PrintFatalError(Rec->getLoc(), "register class has no class info!"); @@ -1200,8 +1211,9 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { } if (!Rec->isSubClassOf("Operand")) - PrintFatalError(Rec->getLoc(), "Operand `" + Rec->getName() + - "' does not derive from class Operand!\n"); + PrintFatalError(Rec->getLoc(), + "Operand `" + Rec->getName() + + "' does not derive from class Operand!\n"); Record *MatchClass = Rec->getValueAsDef("ParserMatchClass"); if (ClassInfo *CI = AsmOperandClasses[MatchClass]) return CI; @@ -1210,19 +1222,18 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) { } struct LessRegisterSet { - bool operator() (const RegisterSet &LHS, const RegisterSet & RHS) const { + bool operator()(const RegisterSet &LHS, const RegisterSet &RHS) const { // std::set defines its own compariso "operator<", but it // performs a lexicographical comparison by T's innate comparison // for some reason. We don't want non-deterministic pointer // comparisons so use this instead. 
- return std::lexicographical_compare(LHS.begin(), LHS.end(), - RHS.begin(), RHS.end(), - LessRecordByID()); + return std::lexicographical_compare(LHS.begin(), LHS.end(), RHS.begin(), + RHS.end(), LessRecordByID()); } }; -void AsmMatcherInfo:: -buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { +void AsmMatcherInfo::buildRegisterClasses( + SmallPtrSetImpl &SingletonRegisters) { const auto &Registers = Target.getRegBank().getRegisters(); auto &RegClassList = Target.getRegBank().getRegClasses(); @@ -1244,7 +1255,7 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { // Introduce derived sets where necessary (when a register does not determine // a unique register set class), and build the mapping of registers to the set // they should classify to. - std::map RegisterMap; + std::map RegisterMap; for (const CodeGenRegister &CGR : Registers) { // Compute the intersection of all sets containing this register. RegisterSet ContainingSet; @@ -1273,7 +1284,7 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { } // Construct the register classes. - std::map RegisterSetClasses; + std::map RegisterSetClasses; unsigned Index = 0; for (const RegisterSet &RS : RegisterSets) { Classes.emplace_front(); @@ -1298,9 +1309,8 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { for (const RegisterSet &RS : RegisterSets) { ClassInfo *CI = RegisterSetClasses[RS]; for (const RegisterSet &RS2 : RegisterSets) - if (RS != RS2 && - std::includes(RS2.begin(), RS2.end(), RS.begin(), RS.end(), - LessRecordByID())) + if (RS != RS2 && std::includes(RS2.begin(), RS2.end(), RS.begin(), + RS.end(), LessRecordByID())) CI->SuperClasses.push_back(RegisterSetClasses[RS2]); } @@ -1354,8 +1364,8 @@ buildRegisterClasses(SmallPtrSetImpl &SingletonRegisters) { } void AsmMatcherInfo::buildOperandClasses() { - std::vector AsmOperands = - Records.getAllDerivedDefinitions("AsmOperandClass"); + std::vector AsmOperands = + Records.getAllDerivedDefinitions("AsmOperandClass"); // Pre-populate AsmOperandClasses map. for (Record *Rec : AsmOperands) { @@ -1438,11 +1448,9 @@ void AsmMatcherInfo::buildOperandClasses() { } } -AsmMatcherInfo::AsmMatcherInfo(Record *asmParser, - CodeGenTarget &target, +AsmMatcherInfo::AsmMatcherInfo(Record *asmParser, CodeGenTarget &target, RecordKeeper &records) - : Records(records), AsmParser(asmParser), Target(target) { -} + : Records(records), AsmParser(asmParser), Target(target) {} /// buildOperandMatchInfo - Build the necessary information to handle user /// defined operand parsing methods. @@ -1476,8 +1484,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() { for (const auto &OCM : OpClassMask) { unsigned OpMask = OCM.second; ClassInfo *CI = OCM.first; - OperandMatchInfo.push_back(OperandMatchEntry::create(MI.get(), CI, - OpMask)); + OperandMatchInfo.push_back( + OperandMatchEntry::create(MI.get(), CI, OpMask)); } } } @@ -1499,7 +1507,7 @@ void AsmMatcherInfo::buildInfo() { // Parse the instructions; we need to do this first so that we can gather the // singleton register classes. 
- SmallPtrSet SingletonRegisters; + SmallPtrSet SingletonRegisters; unsigned VariantCount = Target.getAsmParserVariantCount(); for (unsigned VC = 0; VC != VariantCount; ++VC) { Record *AsmVariant = Target.getAsmParserVariant(VC); @@ -1511,8 +1519,7 @@ void AsmMatcherInfo::buildInfo() { AsmVariant->getValueAsString("TokenizingCharacters"); Variant.SeparatorCharacters = AsmVariant->getValueAsString("SeparatorCharacters"); - Variant.BreakCharacters = - AsmVariant->getValueAsString("BreakCharacters"); + Variant.BreakCharacters = AsmVariant->getValueAsString("BreakCharacters"); Variant.Name = AsmVariant->getValueAsString("Name"); Variant.AsmVariantNo = AsmVariant->getValueAsInt("Variant"); @@ -1546,8 +1553,8 @@ void AsmMatcherInfo::buildInfo() { // Parse all of the InstAlias definitions and stick them in the list of // matchables. - std::vector AllInstAliases = - Records.getAllDerivedDefinitions("InstAlias"); + std::vector AllInstAliases = + Records.getAllDerivedDefinitions("InstAlias"); for (Record *InstAlias : AllInstAliases) { auto Alias = std::make_unique(InstAlias, Target); @@ -1654,14 +1661,14 @@ void AsmMatcherInfo::buildInfo() { // Process token alias definitions and set up the associated superclass // information. - std::vector AllTokenAliases = - Records.getAllDerivedDefinitions("TokenAlias"); + std::vector AllTokenAliases = + Records.getAllDerivedDefinitions("TokenAlias"); for (Record *Rec : AllTokenAliases) { ClassInfo *FromClass = getTokenClass(Rec->getValueAsString("FromToken")); ClassInfo *ToClass = getTokenClass(Rec->getValueAsString("ToToken")); if (FromClass == ToClass) PrintFatalError(Rec->getLoc(), - "error: Destination value identical to source value."); + "error: Destination value identical to source value."); FromClass->SuperClasses.push_back(ToClass); } @@ -1681,10 +1688,9 @@ void AsmMatcherInfo::buildInfo() { /// buildInstructionOperandReference - The specified operand is a reference to a /// named operand such as $src. Resolve the Class and OperandInfo pointers. -void AsmMatcherInfo:: -buildInstructionOperandReference(MatchableInfo *II, - StringRef OperandName, - unsigned AsmOpIdx) { +void AsmMatcherInfo::buildInstructionOperandReference(MatchableInfo *II, + StringRef OperandName, + unsigned AsmOpIdx) { const CodeGenInstruction &CGI = *cast(II->DefRec); const CGIOperandList &Operands = CGI.Operands; MatchableInfo::AsmOperand *Op = &II->AsmOperands[AsmOpIdx]; @@ -1708,7 +1714,8 @@ buildInstructionOperandReference(MatchableInfo *II, for (unsigned SI = 1, SE = Operands[Idx].MINumOperands; SI != SE; ++SI) { MatchableInfo::AsmOperand NewAsmOp(/*IsIsolatedToken=*/true, Token); NewAsmOp.SubOpIdx = SI; - II->AsmOperands.insert(II->AsmOperands.begin()+AsmOpIdx+SI, NewAsmOp); + II->AsmOperands.insert(II->AsmOperands.begin() + AsmOpIdx + SI, + NewAsmOp); } // Replace Op with first suboperand. Op = &II->AsmOperands[AsmOpIdx]; // update the pointer in case it moved @@ -1760,8 +1767,8 @@ void AsmMatcherInfo::buildAliasOperandReference(MatchableInfo *II, // Use the match class from the Alias definition, not the // destination instruction, as we may have an immediate that's // being munged by the match class. - Op.Class = getOperandClass(CGA.ResultOperands[i].getRecord(), - Op.SubOpIdx); + Op.Class = + getOperandClass(CGA.ResultOperands[i].getRecord(), Op.SubOpIdx); Op.SrcOpName = OperandName; Op.OrigSrcOpName = OperandName; return; @@ -1812,8 +1819,8 @@ void MatchableInfo::buildInstructionResultOperands() { // Add a separate ResOperand for each suboperand. 
for (unsigned AI = 0; AI < NumOperands; ++AI) { - assert(AsmOperands[SrcOperand+AI].SubOpIdx == (int)AI && - AsmOperands[SrcOperand+AI].SrcOpName == OpInfo.Name && + assert(AsmOperands[SrcOperand + AI].SubOpIdx == (int)AI && + AsmOperands[SrcOperand + AI].SrcOpName == OpInfo.Name && "unexpected AsmOperands for suboperands"); ResOperands.push_back(ResOperand::getRenderedOp(SrcOperand + AI, 1)); } @@ -1886,8 +1893,9 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { // Handle all the suboperands for this operand. const std::string &OpName = OpInfo->Name; - for ( ; AliasOpNo < LastOpNo && - CGA.ResultInstOperandIndex[AliasOpNo].first == i; ++AliasOpNo) { + for (; AliasOpNo < LastOpNo && + CGA.ResultInstOperandIndex[AliasOpNo].first == i; + ++AliasOpNo) { int SubIdx = CGA.ResultInstOperandIndex[AliasOpNo].second; // Find out what operand from the asmparser that this MCInst operand @@ -1897,17 +1905,18 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { StringRef Name = CGA.ResultOperands[AliasOpNo].getName(); int SrcOperand = findAsmOperand(Name, SubIdx); if (SrcOperand == -1) - PrintFatalError(TheDef->getLoc(), "Instruction '" + - TheDef->getName() + "' has operand '" + OpName + - "' that doesn't appear in asm string!"); + PrintFatalError(TheDef->getLoc(), + "Instruction '" + TheDef->getName() + + "' has operand '" + OpName + + "' that doesn't appear in asm string!"); // Add it to the operand references. If it is added a second time, the // record won't be updated and it will fail later on. OperandRefs.try_emplace(Name, SrcOperand); unsigned NumOperands = (SubIdx == -1 ? OpInfo->MINumOperands : 1); - ResOperands.push_back(ResOperand::getRenderedOp(SrcOperand, - NumOperands)); + ResOperands.push_back( + ResOperand::getRenderedOp(SrcOperand, NumOperands)); break; } case CodeGenInstAlias::ResultOperand::K_Imm: { @@ -1952,7 +1961,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, raw_ostream &OS) { SmallSetVector OperandConversionKinds; SmallSetVector InstructionConversionKinds; - std::vector > ConversionTable; + std::vector> ConversionTable; size_t MaxRowLength = 2; // minimum is custom converter plus terminator. // TargetOperandClass - This is the target's operand class, like X86Operand. @@ -2009,7 +2018,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, CvtOS << " break;\n"; CvtOS << " case CVT_Tied: {\n"; CvtOS << " assert(OpIdx < (size_t)(std::end(TiedAsmOperandTable) -\n"; - CvtOS << " std::begin(TiedAsmOperandTable)) &&\n"; + CvtOS + << " std::begin(TiedAsmOperandTable)) &&\n"; CvtOS << " \"Tied operand not found\");\n"; CvtOS << " unsigned TiedResOpnd = TiedAsmOperandTable[OpIdx][0];\n"; CvtOS << " if (TiedResOpnd != (uint8_t)-1)\n"; @@ -2048,7 +2058,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // Map of e.g. <0, 2, 3> -> "Tie_0_2_3" enum label. std::map, std::string> - TiedOperandsEnumMap; + TiedOperandsEnumMap; for (auto &II : Infos) { // Check if we have a custom match function. @@ -2073,8 +2083,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, ConversionTable.back().push_back(CVT_Done); // Add the handler to the conversion driver function. 
- CvtOS << " case CVT_" - << getEnumNameForToken(AsmMatchConverter) << ":\n" + CvtOS << " case CVT_" << getEnumNameForToken(AsmMatchConverter) + << ":\n" << " " << AsmMatchConverter << "(Inst, Operands);\n" << " break;\n"; @@ -2088,7 +2098,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, std::vector ConversionRow; // Compute the convert enum and the case body. - MaxRowLength = std::max(MaxRowLength, II->ResOperands.size()*2 + 1 ); + MaxRowLength = std::max(MaxRowLength, II->ResOperands.size() * 2 + 1); for (unsigned i = 0, e = II->ResOperands.size(); i != e; ++i) { const MatchableInfo::ResOperand &OpInfo = II->ResOperands[i]; @@ -2098,7 +2108,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, case MatchableInfo::ResOperand::RenderAsmOperand: { // This comes from something we parsed. const MatchableInfo::AsmOperand &Op = - II->AsmOperands[OpInfo.AsmOperandNum]; + II->AsmOperands[OpInfo.AsmOperandNum]; // Registers are always converted the same, don't duplicate the // conversion function based on them. @@ -2111,8 +2121,9 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // Add the conversion kind, if necessary, and get the associated ID // the index of its entry in the vector). - std::string Name = "CVT_" + (Op.Class->isRegisterClass() ? "Reg" : - Op.Class->RenderMethod); + std::string Name = + "CVT_" + + (Op.Class->isRegisterClass() ? "Reg" : Op.Class->RenderMethod); if (Op.Class->IsOptional) { // For optional operands we must also care about DefaultMethod assert(HasOptionalOperands); @@ -2121,8 +2132,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, Name = getEnumNameForToken(Name); bool IsNewConverter = false; - unsigned ID = getConverterOperandID(Name, OperandConversionKinds, - IsNewConverter); + unsigned ID = + getConverterOperandID(Name, OperandConversionKinds, IsNewConverter); // Add the operand entry to the instruction kind conversion row. ConversionRow.push_back(ID); @@ -2171,10 +2182,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // operand from the earlier one.We can only tie single MCOperand values. assert(OpInfo.MINumOperands == 1 && "Not a singular MCOperand"); uint8_t TiedOp = OpInfo.TiedOperands.ResOpnd; - uint8_t SrcOp1 = - OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst; - uint8_t SrcOp2 = - OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst; + uint8_t SrcOp1 = OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst; + uint8_t SrcOp2 = OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst; assert((i > TiedOp || TiedOp == (uint8_t)-1) && "Tied operand precedes its target!"); auto TiedTupleName = std::string("Tie") + utostr(TiedOp) + '_' + @@ -2198,8 +2207,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, std::string Name = "CVT_" + Ty; bool IsNewConverter = false; - unsigned ID = getConverterOperandID(Name, OperandConversionKinds, - IsNewConverter); + unsigned ID = + getConverterOperandID(Name, OperandConversionKinds, IsNewConverter); // Add the operand entry to the instruction kind conversion row. ConversionRow.push_back(ID); ConversionRow.push_back(0); @@ -2230,8 +2239,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, Signature += "__" + Name; Name = "CVT_" + Name; bool IsNewConverter = false; - unsigned ID = getConverterOperandID(Name, OperandConversionKinds, - IsNewConverter); + unsigned ID = + getConverterOperandID(Name, OperandConversionKinds, IsNewConverter); // Add the operand entry to the instruction kind conversion row. 
ConversionRow.push_back(ID); ConversionRow.push_back(0); @@ -2289,9 +2298,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, OS << "static const uint8_t TiedAsmOperandTable[][3] = {\n"; for (auto &KV : TiedOperandsEnumMap) { - OS << " /* " << KV.second << " */ { " - << utostr(std::get<0>(KV.first)) << ", " - << utostr(std::get<1>(KV.first)) << ", " + OS << " /* " << KV.second << " */ { " << utostr(std::get<0>(KV.first)) + << ", " << utostr(std::get<1>(KV.first)) << ", " << utostr(std::get<2>(KV.first)) << " },\n"; } OS << "};\n\n"; @@ -2402,7 +2410,8 @@ static void emitMatchClassEnumeration(CodeGenTarget &Target, /// emitOperandMatchErrorDiagStrings - Emit a function to get the diagnostic text to be /// used when an assembly operand does not match the expected operand class. -static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream &OS) { +static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, + raw_ostream &OS) { // If the target does not use DiagnosticString for any operands, don't emit // an unused function. if (llvm::all_of(Info.Classes, [](const ClassInfo &CI) { @@ -2415,12 +2424,12 @@ static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream & << "MatchResultTy MatchResult) {\n"; OS << " switch (MatchResult) {\n"; - for (const auto &CI: Info.Classes) { + for (const auto &CI : Info.Classes) { if (!CI.DiagnosticString.empty()) { assert(!CI.DiagnosticType.empty() && "DiagnosticString set without DiagnosticType"); - OS << " case " << Info.Target.getName() - << "AsmParser::Match_" << CI.DiagnosticType << ":\n"; + OS << " case " << Info.Target.getName() << "AsmParser::Match_" + << CI.DiagnosticType << ":\n"; OS << " return \"" << CI.DiagnosticString << "\";\n"; } } @@ -2441,7 +2450,7 @@ static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) { OS << " return MCTargetAsmParser::Match_InvalidOperand;\n"; } else { OS << " switch (RegisterClass) {\n"; - for (const auto &CI: Info.Classes) { + for (const auto &CI : Info.Classes) { if (CI.isRegisterClass() && !CI.DiagnosticType.empty()) { OS << " case " << CI.Name << ":\n"; OS << " return " << Info.Target.getName() << "AsmParser::Match_" @@ -2458,8 +2467,7 @@ static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) { } /// emitValidateOperandClass - Emit the function to validate an operand class. -static void emitValidateOperandClass(AsmMatcherInfo &Info, - raw_ostream &OS) { +static void emitValidateOperandClass(AsmMatcherInfo &Info, raw_ostream &OS) { OS << "static unsigned validateOperandClass(MCParsedAsmOperand &GOp, " << "MatchClassKind Kind) {\n"; OS << " " << Info.Target.getName() << "Operand &Operand = (" @@ -2495,8 +2503,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info, OS << " return " << Info.Target.getName() << "AsmParser::Match_" << CI.DiagnosticType << ";\n"; OS << " break;\n"; - } - else + } else OS << " break;\n"; OS << " }\n"; } @@ -2508,8 +2515,8 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info, OS << " switch (Operand.getReg()) {\n"; OS << " default: OpKind = InvalidMatchClass; break;\n"; for (const auto &RC : Info.RegisterClasses) - OS << " case " << RC.first->getValueAsString("Namespace") << "::" - << RC.first->getName() << ": OpKind = " << RC.second->Name + OS << " case " << RC.first->getValueAsString("Namespace") + << "::" << RC.first->getName() << ": OpKind = " << RC.second->Name << "; break;\n"; OS << " }\n"; OS << " return isSubclass(OpKind, Kind) ?
" @@ -2676,7 +2683,8 @@ static void emitOperandDiagnosticTypes(AsmMatcherInfo &Info, raw_ostream &OS) { Types.insert(OpClassEntry.second->DiagnosticType); } - if (Types.empty()) return; + if (Types.empty()) + return; // Now emit the enum entries. for (StringRef Type : Types) @@ -2709,7 +2717,7 @@ static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) { static std::string GetAliasRequiredFeatures(Record *R, const AsmMatcherInfo &Info) { - std::vector ReqFeatures = R->getValueAsListOfDefs("Predicates"); + std::vector ReqFeatures = R->getValueAsListOfDefs("Predicates"); std::string Result; if (ReqFeatures.empty()) @@ -2719,8 +2727,9 @@ static std::string GetAliasRequiredFeatures(Record *R, const SubtargetFeatureInfo *F = Info.getSubtargetFeature(ReqFeatures[i]); if (!F) - PrintFatalError(R->getLoc(), "Predicate '" + ReqFeatures[i]->getName() + - "' is not marked as an AssemblerPredicate!"); + PrintFatalError(R->getLoc(), + "Predicate '" + ReqFeatures[i]->getName() + + "' is not marked as an AssemblerPredicate!"); if (i) Result += " && "; @@ -2731,21 +2740,21 @@ static std::string GetAliasRequiredFeatures(Record *R, return Result; } -static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info, - std::vector &Aliases, - unsigned Indent = 0, - StringRef AsmParserVariantName = StringRef()){ +static void +emitMnemonicAliasVariant(raw_ostream &OS, const AsmMatcherInfo &Info, + std::vector &Aliases, unsigned Indent = 0, + StringRef AsmParserVariantName = StringRef()) { // Keep track of all the aliases from a mnemonic. Use an std::map so that the // iteration order of the map is stable. - std::map > AliasesFromMnemonic; + std::map> AliasesFromMnemonic; for (Record *R : Aliases) { // FIXME: Allow AssemblerVariantName to be a comma separated list. StringRef AsmVariantName = R->getValueAsString("AsmVariantName"); if (AsmVariantName != AsmParserVariantName) continue; - AliasesFromMnemonic[R->getValueAsString("FromMnemonic").lower()] - .push_back(R); + AliasesFromMnemonic[R->getValueAsString("FromMnemonic").lower()].push_back( + R); } if (AliasesFromMnemonic.empty()) return; @@ -2754,7 +2763,7 @@ static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info, // by the string remapper. std::vector Cases; for (const auto &AliasEntry : AliasesFromMnemonic) { - const std::vector &ToVec = AliasEntry.second; + const std::vector &ToVec = AliasEntry.second; // Loop through each alias and emit code that handles each case. If there // are two instructions without predicates, emit an error. 
If there is one, @@ -2818,12 +2827,13 @@ static bool emitMnemonicAliases(raw_ostream &OS, const AsmMatcherInfo &Info, if (!MatchPrefix.empty()) return false; - std::vector Aliases = - Info.getRecords().getAllDerivedDefinitions("MnemonicAlias"); - if (Aliases.empty()) return false; + std::vector Aliases = + Info.getRecords().getAllDerivedDefinitions("MnemonicAlias"); + if (Aliases.empty()) + return false; OS << "static void applyMnemonicAliases(StringRef &Mnemonic, " - "const FeatureBitset &Features, unsigned VariantID) {\n"; + "const FeatureBitset &Features, unsigned VariantID) {\n"; OS << " switch (VariantID) {\n"; unsigned VariantCount = Target.getAsmParserVariantCount(); for (unsigned VC = 0; VC != VariantCount; ++VC) { @@ -2859,17 +2869,15 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, // Emit the static custom operand parsing table; OS << "namespace {\n"; OS << " struct OperandMatchEntry {\n"; - OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) - << " Mnemonic;\n"; - OS << " " << getMinimalTypeForRange(MaxMask) - << " OperandMask;\n"; + OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) << " Mnemonic;\n"; + OS << " " << getMinimalTypeForRange(MaxMask) << " OperandMask;\n"; OS << " " << getMinimalTypeForRange( std::distance(Info.Classes.begin(), Info.Classes.end()) + 2 /* Include 'InvalidMatchClass' and 'OptionalMatchClass' */) << " Class;\n"; OS << " " << getMinimalTypeForRange(MaxFeaturesIndex) - << " RequiredFeaturesIdx;\n\n"; + << " RequiredFeaturesIdx;\n\n"; OS << " StringRef getMnemonic() const {\n"; OS << " return StringRef(MnemonicTable + Mnemonic + 1,\n"; OS << " MnemonicTable[Mnemonic]);\n"; @@ -2903,13 +2911,13 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, // Store a pascal-style length byte in the mnemonic. std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.lower(); - OS << StringTable.GetOrAddStringOffset(LenMnemonic, false) - << " /* " << II.Mnemonic << " */, "; + OS << StringTable.GetOrAddStringOffset(LenMnemonic, false) << " /* " + << II.Mnemonic << " */, "; OS << OMI.OperandMask; OS << " /* "; ListSeparator LS; - for (int i = 0, e = 31; i !=e; ++i) + for (int i = 0, e = 31; i != e; ++i) if (OMI.OperandMask & (1 << i)) OS << LS << i; OS << " */, "; @@ -2958,7 +2966,8 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, // Emit code to get the available features. 
OS << " // Get the current feature set.\n"; - OS << " const FeatureBitset &AvailableFeatures = getAvailableFeatures();\n\n"; + OS << " const FeatureBitset &AvailableFeatures = " + "getAvailableFeatures();\n\n"; OS << " // Get the next operand index.\n"; OS << " unsigned NextOpNum = Operands.size()" @@ -3064,7 +3073,7 @@ static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target, << "MnemonicSpellCheck(StringRef S, const FeatureBitset &FBS," << " unsigned VariantID) {\n"; if (!VariantCount) - OS << " return \"\";"; + OS << " return \"\";"; else { OS << " const unsigned MaxEditDist = 2;\n"; OS << " std::vector Candidates;\n"; @@ -3112,10 +3121,8 @@ static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target, OS << "\n"; } -static void emitMnemonicChecker(raw_ostream &OS, - CodeGenTarget &Target, - unsigned VariantCount, - bool HasMnemonicFirst, +static void emitMnemonicChecker(raw_ostream &OS, CodeGenTarget &Target, + unsigned VariantCount, bool HasMnemonicFirst, bool HasMnemonicAliases) { OS << "static bool " << Target.getName() << "CheckMnemonic(StringRef Mnemonic,\n"; @@ -3125,7 +3132,7 @@ static void emitMnemonicChecker(raw_ostream &OS, << "unsigned VariantID) {\n"; if (!VariantCount) { - OS << " return false;\n"; + OS << " return false;\n"; } else { if (HasMnemonicAliases) { OS << " // Process all MnemonicAliases to remap the mnemonic.\n"; @@ -3232,9 +3239,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { #endif DEBUG_WITH_TYPE("instruction_info", { - for (const auto &MI : Info.Matchables) - MI->dump(); - }); + for (const auto &MI : Info.Matchables) + MI->dump(); + }); // Check for ambiguous matchables. DEBUG_WITH_TYPE("ambiguous_instrs", { @@ -3256,8 +3263,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { } } if (NumAmbiguous) - errs() << "warning: " << NumAmbiguous - << " ambiguous matchables!\n"; + errs() << "warning: " << NumAmbiguous << " ambiguous matchables!\n"; }); // Compute the information on the custom operand parsing. 
@@ -3275,12 +3281,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << "#undef GET_ASSEMBLER_HEADER\n"; OS << " // This should be included into the middle of the declaration of\n"; OS << " // your subclasses implementation of MCTargetAsmParser.\n"; - OS << " FeatureBitset ComputeAvailableFeatures(const FeatureBitset &FB) const;\n"; + OS << " FeatureBitset ComputeAvailableFeatures(const FeatureBitset &FB) " + "const;\n"; if (HasOptionalOperands) { OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, " << "unsigned Opcode,\n" << " const OperandVector &Operands,\n" - << " const SmallBitVector &OptionalOperandsMask);\n"; + << " const SmallBitVector " + "&OptionalOperandsMask);\n"; } else { OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, " << "unsigned Opcode,\n" @@ -3291,7 +3299,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " unsigned MatchInstructionImpl(const OperandVector &Operands,\n" << " MCInst &Inst,\n"; if (ReportMultipleNearMisses) - OS << " SmallVectorImpl *NearMisses,\n"; + OS << " SmallVectorImpl " + "*NearMisses,\n"; else OS << " uint64_t &ErrorInfo,\n" << " FeatureBitset &MissingFeatures,\n"; @@ -3304,11 +3313,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { << " bool matchingInlineAsm,\n" << " unsigned VariantID = 0) {\n" << " FeatureBitset MissingFeatures;\n" - << " return MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,\n" + << " return MatchInstructionImpl(Operands, Inst, ErrorInfo, " + "MissingFeatures,\n" << " matchingInlineAsm, VariantID);\n" << " }\n\n"; - if (!Info.OperandMatchInfo.empty()) { OS << " ParseStatus MatchOperandParserImpl(\n"; OS << " OperandVector &Operands,\n"; @@ -3362,9 +3371,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { // Generate the convertToMCInst function to convert operands into an MCInst. // Also, generate the convertToMapAndConstraints function for MS-style inline // assembly. The latter doesn't actually generate a MCInst. - unsigned NumConverters = emitConvertFuncs(Target, ClassName, Info.Matchables, - HasMnemonicFirst, - HasOptionalOperands, OS); + unsigned NumConverters = + emitConvertFuncs(Target, ClassName, Info.Matchables, HasMnemonicFirst, + HasOptionalOperands, OS); // Emit the enumeration for classes which participate in matching. emitMatchClassEnumeration(Target, Info.Classes, OS); @@ -3406,8 +3415,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { // Store a pascal-style length byte in the mnemonic. std::string LenMnemonic = char(MI->Mnemonic.size()) + MI->Mnemonic.lower(); - MaxMnemonicIndex = std::max(MaxMnemonicIndex, - StringTable.GetOrAddStringOffset(LenMnemonic, false)); + MaxMnemonicIndex = std::max( + MaxMnemonicIndex, StringTable.GetOrAddStringOffset(LenMnemonic, false)); } OS << "static const char MnemonicTable[] =\n"; @@ -3476,13 +3485,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { // following the mnemonic. 
OS << "namespace {\n"; OS << " struct MatchEntry {\n"; - OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) - << " Mnemonic;\n"; + OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) << " Mnemonic;\n"; OS << " uint16_t Opcode;\n"; - OS << " " << getMinimalTypeForRange(NumConverters) - << " ConvertFn;\n"; + OS << " " << getMinimalTypeForRange(NumConverters) << " ConvertFn;\n"; OS << " " << getMinimalTypeForRange(FeatureBitsets.size()) - << " RequiredFeaturesIdx;\n"; + << " RequiredFeaturesIdx;\n"; OS << " " << getMinimalTypeForRange( std::distance(Info.Classes.begin(), Info.Classes.end()) + @@ -3524,9 +3531,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { std::string LenMnemonic = char(MI->Mnemonic.size()) + MI->Mnemonic.lower(); OS << " { " << StringTable.GetOrAddStringOffset(LenMnemonic, false) - << " /* " << MI->Mnemonic << " */, " - << Target.getInstNamespace() << "::" - << MI->getResultInst()->TheDef->getName() << ", " + << " /* " << MI->Mnemonic << " */, " << Target.getInstNamespace() + << "::" << MI->getResultInst()->TheDef->getName() << ", " << MI->ConversionFnKind << ", "; // Write the required features mask. @@ -3563,17 +3569,17 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (!ReportMultipleNearMisses) { OS << " // Eliminate obvious mismatches.\n"; - OS << " if (Operands.size() > " - << (MaxNumOperands + HasMnemonicFirst) << ") {\n"; - OS << " ErrorInfo = " - << (MaxNumOperands + HasMnemonicFirst) << ";\n"; + OS << " if (Operands.size() > " << (MaxNumOperands + HasMnemonicFirst) + << ") {\n"; + OS << " ErrorInfo = " << (MaxNumOperands + HasMnemonicFirst) << ";\n"; OS << " return Match_InvalidOperand;\n"; OS << " }\n\n"; } // Emit code to get the available features. OS << " // Get the current feature set.\n"; - OS << " const FeatureBitset &AvailableFeatures = getAvailableFeatures();\n\n"; + OS << " const FeatureBitset &AvailableFeatures = " + "getAvailableFeatures();\n\n"; OS << " // Get the instruction mnemonic, which is the first token.\n"; if (HasMnemonicFirst) { @@ -3632,7 +3638,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { "std::equal_range(Start, End, Mnemonic.lower(), LessOpcode());\n\n"; } - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"AsmMatcher: found \" <<\n" + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"AsmMatcher: found \" " + "<<\n" << " std::distance(MnemonicRange.first, MnemonicRange.second) <<\n" << " \" encodings with mnemonic '\" << Mnemonic << \"'\\n\");\n\n"; @@ -3647,15 +3654,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { "FeatureBitsets[it->RequiredFeaturesIdx];\n"; OS << " bool HasRequiredFeatures =\n"; OS << " (AvailableFeatures & RequiredFeatures) == RequiredFeatures;\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Trying to match opcode \"\n"; - OS << " << MII.getName(it->Opcode) << \"\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Trying to match " + "opcode \"\n"; + OS << " << MII.getName(it->Opcode) " + "<< \"\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " // Some state to record ways in which this instruction did not match.\n"; + OS << " // Some state to record ways in which this instruction did not " + "match.\n"; OS << " NearMissInfo OperandNearMiss = NearMissInfo::getSuccess();\n"; OS << " NearMissInfo FeaturesNearMiss = NearMissInfo::getSuccess();\n"; - OS << " NearMissInfo EarlyPredicateNearMiss = NearMissInfo::getSuccess();\n"; - OS << " NearMissInfo LatePredicateNearMiss = NearMissInfo::getSuccess();\n"; + OS << " NearMissInfo EarlyPredicateNearMiss = " + 
"NearMissInfo::getSuccess();\n"; + OS << " NearMissInfo LatePredicateNearMiss = " + "NearMissInfo::getSuccess();\n"; OS << " bool MultipleInvalidOperands = false;\n"; } @@ -3676,30 +3688,39 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " auto Formal = " << "static_cast(it->Classes[FormalIdx]);\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n"; - OS << " dbgs() << \" Matching formal operand class \" << getMatchClassName(Formal)\n"; - OS << " << \" against actual operand at index \" << ActualIdx);\n"; + OS << " dbgs() << \" Matching formal operand class \" " + "<< getMatchClassName(Formal)\n"; + OS << " << \" against actual operand at index \" " + "<< ActualIdx);\n"; OS << " if (ActualIdx < Operands.size())\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \" (\";\n"; - OS << " Operands[ActualIdx]->print(dbgs()); dbgs() << \"): \");\n"; + OS << " Operands[ActualIdx]->print(dbgs()); dbgs() << " + "\"): \");\n"; OS << " else\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \": \");\n"; OS << " if (ActualIdx >= Operands.size()) {\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"actual operand " "index out of range\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " bool ThisOperandValid = (Formal == " <<"InvalidMatchClass) || " - "isSubclass(Formal, OptionalMatchClass);\n"; + OS << " bool ThisOperandValid = (Formal == " + << "InvalidMatchClass) || " + "isSubclass(Formal, OptionalMatchClass);\n"; OS << " if (!ThisOperandValid) {\n"; OS << " if (!OperandNearMiss) {\n"; OS << " // Record info about match failure for later use.\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"recording too-few-operands near miss\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"recording " + "too-few-operands near miss\\n\");\n"; OS << " OperandNearMiss =\n"; - OS << " NearMissInfo::getTooFewOperands(Formal, it->Opcode);\n"; - OS << " } else if (OperandNearMiss.getKind() != NearMissInfo::NearMissTooFewOperands) {\n"; - OS << " // If more than one operand is invalid, give up on this match entry.\n"; + OS << " NearMissInfo::getTooFewOperands(Formal, " + "it->Opcode);\n"; + OS << " } else if (OperandNearMiss.getKind() != " + "NearMissInfo::NearMissTooFewOperands) {\n"; + OS << " // If more than one operand is invalid, give up on this " + "match entry.\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"second invalid operand, giving up on this opcode\\n\");\n"; + OS << " dbgs() << \"second invalid operand, giving up on " + "this opcode\\n\");\n"; OS << " MultipleInvalidOperands = true;\n"; OS << " break;\n"; OS << " }\n"; @@ -3731,17 +3752,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " unsigned Diag = validateOperandClass(Actual, Formal);\n"; OS << " if (Diag == Match_Success) {\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n"; - OS << " dbgs() << \"match success using generic matcher\\n\");\n"; + OS << " dbgs() << \"match success using generic " + "matcher\\n\");\n"; OS << " ++ActualIdx;\n"; OS << " continue;\n"; OS << " }\n"; OS << " // If the generic handler indicates an invalid operand\n"; OS << " // failure, check for a special case.\n"; OS << " if (Diag != Match_Success) {\n"; - OS << " unsigned TargetDiag = validateTargetOperandClass(Actual, Formal);\n"; + OS << " unsigned TargetDiag = validateTargetOperandClass(Actual, " + "Formal);\n"; OS << " if (TargetDiag == Match_Success) {\n"; OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n"; - OS << " dbgs() << \"match success using target matcher\\n\");\n"; + OS << " dbgs() << 
\"match success using target " + "matcher\\n\");\n"; OS << " ++ActualIdx;\n"; OS << " continue;\n"; OS << " }\n"; @@ -3758,38 +3782,46 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (HasOptionalOperands) { OS << " OptionalOperandsMask.set(FormalIdx);\n"; } - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"ignoring optional operand\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"ignoring " + "optional operand\\n\");\n"; OS << " continue;\n"; OS << " }\n"; if (ReportMultipleNearMisses) { OS << " if (!OperandNearMiss) {\n"; - OS << " // If this is the first invalid operand we have seen, record some\n"; + OS << " // If this is the first invalid operand we have seen, " + "record some\n"; OS << " // information about it.\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; OS << " dbgs()\n"; - OS << " << \"operand match failed, recording near-miss with diag code \"\n"; + OS << " << \"operand match failed, recording near-miss with " + "diag code \"\n"; OS << " << Diag << \"\\n\");\n"; OS << " OperandNearMiss =\n"; - OS << " NearMissInfo::getMissedOperand(Diag, Formal, it->Opcode, ActualIdx);\n"; + OS << " NearMissInfo::getMissedOperand(Diag, Formal, " + "it->Opcode, ActualIdx);\n"; OS << " ++ActualIdx;\n"; OS << " } else {\n"; - OS << " // If more than one operand is invalid, give up on this match entry.\n"; + OS << " // If more than one operand is invalid, give up on this " + "match entry.\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"second operand mismatch, skipping this opcode\\n\");\n"; + OS << " dbgs() << \"second operand mismatch, skipping this " + "opcode\\n\");\n"; OS << " MultipleInvalidOperands = true;\n"; OS << " break;\n"; OS << " }\n"; OS << " }\n\n"; } else { - OS << " // If this operand is broken for all of the instances of this\n"; + OS << " // If this operand is broken for all of the instances of " + "this\n"; OS << " // mnemonic, keep track of it so we can report loc info.\n"; OS << " // If we already had a match that only failed due to a\n"; OS << " // target predicate, that diagnostic is preferred.\n"; OS << " if (!HadMatchOtherThanPredicate &&\n"; - OS << " (it == MnemonicRange.first || ErrorInfo <= ActualIdx)) {\n"; + OS << " (it == MnemonicRange.first || ErrorInfo <= ActualIdx)) " + "{\n"; OS << " if (HasRequiredFeatures && (ErrorInfo != ActualIdx || Diag " "!= Match_InvalidOperand))\n"; OS << " RetCode = Diag;\n"; @@ -3805,8 +3837,10 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " if (MultipleInvalidOperands) {\n"; else OS << " if (!OperandsValid) {\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n"; - OS << " \"operand mismatches, ignoring \"\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " + "multiple \"\n"; + OS << " \"operand mismatches, " + "ignoring \"\n"; OS << " \"this opcode\\n\");\n"; OS << " continue;\n"; OS << " }\n"; @@ -3817,13 +3851,16 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " HadMatchOtherThanFeatures = true;\n"; OS << " FeatureBitset NewMissingFeatures = RequiredFeatures & " "~AvailableFeatures;\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Missing target features:\";\n"; - OS << " for (unsigned I = 0, E = NewMissingFeatures.size(); I != E; ++I)\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Missing target " + "features:\";\n"; + OS << " for (unsigned I = 0, E = " + "NewMissingFeatures.size(); I != E; ++I)\n"; OS << " if (NewMissingFeatures[I])\n"; OS << " dbgs() << 
' ' << I;\n"; OS << " dbgs() << \"\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " FeaturesNearMiss = NearMissInfo::getMissedFeature(NewMissingFeatures);\n"; + OS << " FeaturesNearMiss = " "NearMissInfo::getMissedFeature(NewMissingFeatures);\n"; } else { OS << " if (NewMissingFeatures.count() <=\n" " MissingFeatures.count())\n"; @@ -3848,10 +3885,12 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { << " Inst.clear();\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"Early target match predicate failed with diag code \"\n"; + OS << " dbgs() << \"Early target match predicate failed with diag " "code \"\n"; OS << " << MatchResult << \"\\n\");\n"; if (ReportMultipleNearMisses) { - OS << " EarlyPredicateNearMiss = NearMissInfo::getMissedPredicate(MatchResult);\n"; + OS << " EarlyPredicateNearMiss = " "NearMissInfo::getMissedPredicate(MatchResult);\n"; } else { OS << " RetCode = MatchResult;\n" << " HadMatchOtherThanPredicate = true;\n" @@ -3860,20 +3899,27 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " }\n\n"; if (ReportMultipleNearMisses) { - OS << " // If we did not successfully match the operands, then we can't convert to\n"; + OS << " // If we did not successfully match the operands, then we can't " "convert to\n"; OS << " // an MCInst, so bail out on this instruction variant now.\n"; OS << " if (OperandNearMiss) {\n"; - OS << " // If the operand mismatch was the only problem, report it as " "a near-miss.\n"; + OS << " if (NearMisses && !FeaturesNearMiss && " "!EarlyPredicateNearMiss) {\n"; OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; OS << " dbgs()\n"; - OS << " << \"Opcode result: one mismatched operand, adding near-miss\\n\");\n"; + OS << " << \"Opcode result: one mismatched operand, adding " "near-miss\\n\");\n"; OS << " NearMisses->push_back(OperandNearMiss);\n"; OS << " } else {\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n"; - OS << " \"types of mismatch, so not \"\n"; - OS << " \"reporting near-miss\\n\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " "multiple \"\n"; + OS << " \"types of " "mismatch, so not \"\n"; + OS << " \"reporting " "near-miss\\n\");\n"; OS << " }\n"; OS << " continue;\n"; OS << " }\n\n"; @@ -3905,11 +3951,13 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { << " if ((MatchResult = checkTargetMatchPredicate(Inst)) !=" << " Match_Success) {\n" << " DEBUG_WITH_TYPE(\"asm-matcher\",\n" - << " dbgs() << \"Target match predicate failed with diag code \"\n" + << " dbgs() << \"Target match predicate failed with " "diag code \"\n" << " << MatchResult << \"\\n\");\n" << " Inst.clear();\n"; if (ReportMultipleNearMisses) { - OS << " LatePredicateNearMiss = NearMissInfo::getMissedPredicate(MatchResult);\n"; + OS << " LatePredicateNearMiss = " "NearMissInfo::getMissedPredicate(MatchResult);\n"; } else { OS << " RetCode = MatchResult;\n" << " HadMatchOtherThanPredicate = true;\n" @@ -3923,10 +3971,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " (int)(bool)EarlyPredicateNearMiss +\n"; OS << " (int)(bool)LatePredicateNearMiss);\n"; OS << " if (NumNearMisses == 1) {\n"; - OS << " // We had exactly one type of near-miss, so add that to the list.\n"; - OS << " assert(!OperandNearMiss && \"OperandNearMiss was handled earlier\");\n"; - OS << " 
DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: found one type of \"\n"; - OS << " \"mismatch, so reporting a \"\n"; + OS << " // We had exactly one type of near-miss, so add that to the " + "list.\n"; + OS << " assert(!OperandNearMiss && \"OperandNearMiss was handled " + "earlier\");\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " + "found one type of \"\n"; + OS << " \"mismatch, so " + "reporting a \"\n"; OS << " \"near-miss\\n\");\n"; OS << " if (NearMisses && FeaturesNearMiss)\n"; OS << " NearMisses->push_back(FeaturesNearMiss);\n"; @@ -3937,10 +3989,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << "\n"; OS << " continue;\n"; OS << " } else if (NumNearMisses > 1) {\n"; - OS << " // This instruction missed in more than one way, so ignore it.\n"; - OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n"; - OS << " \"types of mismatch, so not \"\n"; - OS << " \"reporting near-miss\\n\");\n"; + OS << " // This instruction missed in more than one way, so ignore " + "it.\n"; + OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: " + "multiple \"\n"; + OS << " \"types of mismatch, " + "so not \"\n"; + OS << " \"reporting " + "near-miss\\n\");\n"; OS << " continue;\n"; OS << " }\n"; } @@ -3952,7 +4008,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (HasDeprecation) { OS << " std::string Info;\n"; - OS << " if (!getParser().getTargetParser().getTargetOptions().MCNoDeprecatedWarn &&\n"; + OS << " if " + "(!getParser().getTargetParser().getTargetOptions()." + "MCNoDeprecatedWarn &&\n"; OS << " MII.getDeprecatedInfo(Inst, getSTI(), Info)) {\n"; OS << " SMLoc Loc = ((" << Target.getName() << "Operand &)*Operands[0]).getStartLoc();\n"; @@ -3969,7 +4027,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " DEBUG_WITH_TYPE(\n"; OS << " \"asm-matcher\",\n"; - OS << " dbgs() << \"Opcode result: complete match, selecting this opcode\\n\");\n"; + OS << " dbgs() << \"Opcode result: complete match, selecting this " + "opcode\\n\");\n"; OS << " return Match_Success;\n"; OS << " }\n\n"; @@ -4002,8 +4061,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << "\n#ifdef GET_MNEMONIC_CHECKER\n"; OS << "#undef GET_MNEMONIC_CHECKER\n\n"; - emitMnemonicChecker(OS, Target, VariantCount, - HasMnemonicFirst, HasMnemonicAliases); + emitMnemonicChecker(OS, Target, VariantCount, HasMnemonicFirst, + HasMnemonicAliases); OS << "#endif // GET_MNEMONIC_CHECKER\n\n"; } diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp index e0cd5fa..c05991f 100644 --- a/llvm/utils/TableGen/AsmWriterEmitter.cpp +++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp @@ -64,6 +64,7 @@ public: AsmWriterEmitter(RecordKeeper &R); void run(raw_ostream &o); + private: void EmitGetMnemonic( raw_ostream &o, @@ -84,9 +85,9 @@ private: } // end anonymous namespace -static void PrintCases(std::vector> &OpsToPrint, raw_ostream &O, - bool PassSubtarget) { +static void +PrintCases(std::vector> &OpsToPrint, + raw_ostream &O, bool PassSubtarget) { O << " case " << OpsToPrint.back().first << ":"; AsmWriterOperand TheOp = OpsToPrint.back().second; OpsToPrint.pop_back(); @@ -94,9 +95,9 @@ static void PrintCases(std::vector &Insts, - raw_ostream &O, bool PassSubtarget) { +static void EmitInstructions(std::vector &Insts, raw_ostream &O, + bool PassSubtarget) { AsmWriterInst FirstInst = Insts.back(); Insts.pop_back(); std::vector SimilarInsts; unsigned DifferingOperand = ~0; for (unsigned i = Insts.size(); i != 0; --i) { - 
unsigned DiffOp = Insts[i-1].MatchesAllButOneOp(FirstInst); + unsigned DiffOp = Insts[i - 1].MatchesAllButOneOp(FirstInst); if (DiffOp != ~1U) { - if (DifferingOperand == ~0U) // First match! + if (DifferingOperand == ~0U) // First match! DifferingOperand = DiffOp; // If this differs in the same operand as the rest of the instructions in // this class, move it to the SimilarInsts list. if (DifferingOperand == DiffOp || DiffOp == ~0U) { - SimilarInsts.push_back(Insts[i-1]); - Insts.erase(Insts.begin()+i-1); + SimilarInsts.push_back(Insts[i - 1]); + Insts.erase(Insts.begin() + i - 1); } } } - O << " case " << FirstInst.CGI->Namespace << "::" - << FirstInst.CGI->TheDef->getName() << ":\n"; + O << " case " << FirstInst.CGI->Namespace + << "::" << FirstInst.CGI->TheDef->getName() << ":\n"; for (const AsmWriterInst &AWI : SimilarInsts) - O << " case " << AWI.CGI->Namespace << "::" - << AWI.CGI->TheDef->getName() << ":\n"; + O << " case " << AWI.CGI->Namespace << "::" << AWI.CGI->TheDef->getName() + << ":\n"; for (unsigned i = 0, e = FirstInst.Operands.size(); i != e; ++i) { if (i != DifferingOperand) { // If the operand is the same for all instructions, just print it. @@ -143,14 +144,15 @@ static void EmitInstructions(std::vector &Insts, O << " switch (MI->getOpcode()) {\n"; O << " default: llvm_unreachable(\"Unexpected opcode.\");\n"; std::vector> OpsToPrint; - OpsToPrint.push_back(std::make_pair(FirstInst.CGI->Namespace.str() + "::" + - FirstInst.CGI->TheDef->getName().str(), - FirstInst.Operands[i])); + OpsToPrint.push_back( + std::make_pair(FirstInst.CGI->Namespace.str() + + "::" + FirstInst.CGI->TheDef->getName().str(), + FirstInst.Operands[i])); for (const AsmWriterInst &AWI : SimilarInsts) { - OpsToPrint.push_back(std::make_pair(AWI.CGI->Namespace.str()+"::" + - AWI.CGI->TheDef->getName().str(), - AWI.Operands[i])); + OpsToPrint.push_back(std::make_pair( + AWI.CGI->Namespace.str() + "::" + AWI.CGI->TheDef->getName().str(), + AWI.Operands[i])); } std::reverse(OpsToPrint.begin(), OpsToPrint.end()); while (!OpsToPrint.empty()) @@ -162,11 +164,10 @@ static void EmitInstructions(std::vector &Insts, O << " break;\n"; } -void AsmWriterEmitter:: -FindUniqueOperandCommands(std::vector &UniqueOperandCommands, - std::vector> &InstIdxs, - std::vector &InstOpsUsed, - bool PassSubtarget) const { +void AsmWriterEmitter::FindUniqueOperandCommands( + std::vector &UniqueOperandCommands, + std::vector> &InstIdxs, + std::vector &InstOpsUsed, bool PassSubtarget) const { // This vector parallels UniqueOperandCommands, keeping track of which // instructions each case are used for. It is a comma separated string of // enums. @@ -177,9 +178,10 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, for (size_t i = 0, e = Instructions.size(); i != e; ++i) { const AsmWriterInst &Inst = Instructions[i]; if (Inst.Operands.empty()) - continue; // Instruction already done. + continue; // Instruction already done. - std::string Command = " "+Inst.Operands[0].getCode(PassSubtarget)+"\n"; + std::string Command = + " " + Inst.Operands[0].getCode(PassSubtarget) + "\n"; // Check to see if we already have 'Command' in UniqueOperandCommands. // If not, add it. @@ -203,12 +205,12 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, // For each entry of UniqueOperandCommands, there is a set of instructions // that uses it. If the next command of all instructions in the set are // identical, fold it into the command. 
- for (size_t CommandIdx = 0, e = UniqueOperandCommands.size(); - CommandIdx != e; ++CommandIdx) { + for (size_t CommandIdx = 0, e = UniqueOperandCommands.size(); CommandIdx != e; + ++CommandIdx) { const auto &Idxs = InstIdxs[CommandIdx]; - for (unsigned Op = 1; ; ++Op) { + for (unsigned Op = 1;; ++Op) { // Find the first instruction in the set. const AsmWriterInst &FirstInst = Instructions[Idxs.front()]; // If this instruction has no more operands, there isn't anything to merge // into this command. @@ -227,8 +229,8 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, // Okay, everything in this command set has the same next operand. Add it // to UniqueOperandCommands and remember that it was consumed. - std::string Command = " " + - FirstInst.Operands[Op].getCode(PassSubtarget) + "\n"; + std::string Command = + " " + FirstInst.Operands[Op].getCode(PassSubtarget) + "\n"; UniqueOperandCommands[CommandIdx] += Command; InstOpsUsed[CommandIdx]++; @@ -239,35 +241,58 @@ FindUniqueOperandCommands(std::vector &UniqueOperandCommands, for (unsigned i = 0, e = InstrsForCase.size(); i != e; ++i) { std::string Instrs = InstrsForCase[i]; if (Instrs.size() > 70) { - Instrs.erase(Instrs.begin()+70, Instrs.end()); + Instrs.erase(Instrs.begin() + 70, Instrs.end()); Instrs += "..."; } if (!Instrs.empty()) - UniqueOperandCommands[i] = " // " + Instrs + "\n" + - UniqueOperandCommands[i]; + UniqueOperandCommands[i] = + " // " + Instrs + "\n" + UniqueOperandCommands[i]; } } static void UnescapeString(std::string &Str) { for (unsigned i = 0; i != Str.size(); ++i) { - if (Str[i] == '\\' && i != Str.size()-1) { - switch (Str[i+1]) { - default: continue; // Don't execute the code after the switch. - case 'a': Str[i] = '\a'; break; - case 'b': Str[i] = '\b'; break; - case 'e': Str[i] = 27; break; - case 'f': Str[i] = '\f'; break; - case 'n': Str[i] = '\n'; break; - case 'r': Str[i] = '\r'; break; - case 't': Str[i] = '\t'; break; - case 'v': Str[i] = '\v'; break; - case '"': Str[i] = '\"'; break; - case '\'': Str[i] = '\''; break; - case '\\': Str[i] = '\\'; break; + if (Str[i] == '\\' && i != Str.size() - 1) { + switch (Str[i + 1]) { + default: + continue; // Don't execute the code after the switch. + case 'a': + Str[i] = '\a'; + break; + case 'b': + Str[i] = '\b'; + break; + case 'e': + Str[i] = 27; + break; + case 'f': + Str[i] = '\f'; + break; + case 'n': + Str[i] = '\n'; + break; + case 'r': + Str[i] = '\r'; + break; + case 't': + Str[i] = '\t'; + break; + case 'v': + Str[i] = '\v'; + break; + case '"': + Str[i] = '\"'; + break; + case '\'': + Str[i] = '\''; + break; + case '\\': + Str[i] = '\\'; + break; } // Nuke the second character. - Str.erase(Str.begin()+i+1); } } } @@ -281,14 +306,19 @@ static void UnescapeString(std::string &Str) { /// causes non-standard escape character warnings. static void UnescapeAliasString(std::string &Str) { for (unsigned i = 0; i != Str.size(); ++i) { - if (Str[i] == '\\' && i != Str.size()-1) { - switch (Str[i+1]) { - default: continue; // Don't execute the code after the switch. - case '{': Str[i] = '{'; break; - case '}': Str[i] = '}'; break; + if (Str[i] == '\\' && i != Str.size() - 1) { + switch (Str[i + 1]) { + default: + continue; // Don't execute the code after the switch. + case '{': + Str[i] = '{'; + break; + case '}': + Str[i] = '}'; + break; } // Nuke the second character.
- Str.erase(Str.begin()+i+1); + Str.erase(Str.begin() + i + 1); } } } @@ -318,8 +348,7 @@ void AsmWriterEmitter::EmitGetMnemonic( // Add all strings to the string table upfront so it can generate an optimized // representation. for (AsmWriterInst &AWI : Instructions) { - if (AWI.Operands[0].OperandType == - AsmWriterOperand::isLiteralTextOperand && + if (AWI.Operands[0].OperandType == AsmWriterOperand::isLiteralTextOperand && !AWI.Operands[0].Str.empty()) { std::string Str = AWI.Operands[0].Str; UnescapeString(Str); @@ -347,7 +376,7 @@ void AsmWriterEmitter::EmitGetMnemonic( } // Bias offset by one since we want 0 as a sentinel. - OpcodeInfo[AWI.CGIIndex] = Idx+1; + OpcodeInfo[AWI.CGIIndex] = Idx + 1; } // Figure out how many bits we used for the string index. @@ -365,7 +394,8 @@ void AsmWriterEmitter::EmitGetMnemonic( NumInstOpsHandled, PassSubtarget); // If we ran out of operands to print, we're done. - if (UniqueOperandCommands.empty()) break; + if (UniqueOperandCommands.empty()) + break; // Compute the number of bits we need to represent these cases, this is // ceil(log2(numentries)). @@ -383,14 +413,14 @@ void AsmWriterEmitter::EmitGetMnemonic( unsigned NumOps = NumInstOpsHandled[i]; for (unsigned Idx : InstIdxs[i]) { OpcodeInfo[Instructions[Idx].CGIIndex] |= - (uint64_t)i << (OpcodeInfoBits-BitsLeft); + (uint64_t)i << (OpcodeInfoBits - BitsLeft); // Remove the info about this operand from the instruction. AsmWriterInst &Inst = Instructions[Idx]; if (!Inst.Operands.empty()) { assert(NumOps <= Inst.Operands.size() && "Can't remove this many ops!"); Inst.Operands.erase(Inst.Operands.begin(), - Inst.Operands.begin()+NumOps); + Inst.Operands.begin() + NumOps); } } } @@ -487,7 +517,7 @@ void AsmWriterEmitter::EmitPrintInstruction( << " assert(Bits != 0 && \"Cannot print this instruction.\");\n"; // Output the table driven operand information. - BitsLeft = OpcodeInfoBits-AsmStrBits; + BitsLeft = OpcodeInfoBits - AsmStrBits; for (unsigned i = 0, e = TableDrivenOperandPrinters.size(); i != e; ++i) { std::vector &Commands = TableDrivenOperandPrinters[i]; assert(NumBits <= BitsLeft && "consistency error"); // Emit code to extract this field from Bits. - O << "\n // Fragment " << i << " encoded into " << NumBits - << " bits for " << Commands.size() << " unique commands.\n"; + O << "\n // Fragment " << i << " encoded into " << NumBits << " bits for " + << Commands.size() << " unique commands.\n"; if (Commands.size() == 2) { // Emit two possibilities with if/else. - O << " if ((Bits >> " - << (OpcodeInfoBits-BitsLeft) << ") & " - << ((1 << NumBits)-1) << ") {\n" - << Commands[1] - << " } else {\n" - << Commands[0] - << " }\n\n"; + O << " if ((Bits >> " << (OpcodeInfoBits - BitsLeft) << ") & " + << ((1 << NumBits) - 1) << ") {\n" + << Commands[1] << " } else {\n" + << Commands[0] << " }\n\n"; } else if (Commands.size() == 1) { // Emit a single possibility. O << Commands[0] << "\n\n"; } else { - O << " switch ((Bits >> " - << (OpcodeInfoBits-BitsLeft) << ") & " - << ((1 << NumBits)-1) << ") {\n" + O << " switch ((Bits >> " << (OpcodeInfoBits - BitsLeft) << ") & " + << ((1 << NumBits) - 1) << ") {\n" << " default: llvm_unreachable(\"Invalid command number.\");\n"; // Print out all the cases. @@ -537,7 +563,6 @@ void AsmWriterEmitter::EmitPrintInstruction( // elements in the vector.
std::reverse(Instructions.begin(), Instructions.end()); - // Now that we've emitted all of the operand info that fit into 64 bits, emit // information for those instructions that are left. This is a less dense // encoding, but we expect the main 64-bit table to handle the majority of @@ -572,22 +597,21 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName, AsmName = std::string(Reg.getName()); } else { // Make sure the register has an alternate name for this index. - std::vector AltNameList = - Reg.TheDef->getValueAsListOfDefs("RegAltNameIndices"); + std::vector AltNameList = + Reg.TheDef->getValueAsListOfDefs("RegAltNameIndices"); unsigned Idx = 0, e; for (e = AltNameList.size(); - Idx < e && (AltNameList[Idx]->getName() != AltName); - ++Idx) + Idx < e && (AltNameList[Idx]->getName() != AltName); ++Idx) ; // If the register has an alternate name for this index, use it. // Otherwise, leave it empty as an error flag. if (Idx < e) { std::vector AltNames = - Reg.TheDef->getValueAsListOfStrings("AltNames"); + Reg.TheDef->getValueAsListOfStrings("AltNames"); if (AltNames.size() <= Idx) PrintFatalError(Reg.TheDef->getLoc(), "Register definition missing alt name for '" + - AltName + "'."); + AltName + "'."); AsmName = std::string(AltNames[Idx]); } } @@ -613,15 +637,17 @@ void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) { Record *AsmWriter = Target.getAsmWriter(); StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); const auto &Registers = Target.getRegBank().getRegisters(); - const std::vector &AltNameIndices = Target.getRegAltNameIndices(); + const std::vector &AltNameIndices = Target.getRegAltNameIndices(); bool hasAltNames = AltNameIndices.size() > 1; StringRef Namespace = Registers.front().TheDef->getValueAsString("Namespace"); - O << - "\n\n/// getRegisterName - This method is automatically generated by tblgen\n" - "/// from the register set description. This returns the assembler name\n" - "/// for the specified register.\n" - "const char *" << Target.getName() << ClassName << "::"; + O << "\n\n/// getRegisterName - This method is automatically generated by " + "tblgen\n" + "/// from the register set description. This returns the assembler " + "name\n" + "/// for the specified register.\n" + "const char *" + << Target.getName() << ClassName << "::"; if (hasAltNames) O << "\ngetRegisterName(MCRegister Reg, unsigned AltIdx) {\n"; else @@ -695,8 +721,7 @@ public: void addOperand(StringRef Op, int OpIdx, int PrintMethodIdx = -1) { assert(OpIdx >= 0 && OpIdx < 0xFE && "Idx out of range"); - assert(PrintMethodIdx >= -1 && PrintMethodIdx < 0xFF && - "Idx out of range"); + assert(PrintMethodIdx >= -1 && PrintMethodIdx < 0xFF && "Idx out of range"); OpMap[Op] = std::make_pair(OpIdx, PrintMethodIdx); } @@ -791,7 +816,7 @@ namespace { struct AliasPriorityComparator { typedef std::pair ValueType; bool operator()(const ValueType &LHS, const ValueType &RHS) const { - if (LHS.second == RHS.second) { + if (LHS.second == RHS.second) { // We don't actually care about the order, but for consistency it // shouldn't depend on pointer comparisons. 
return LessRecordByID()(LHS.first.TheDef, RHS.first.TheDef); @@ -819,8 +844,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { unsigned Variant = AsmWriter->getValueAsInt("Variant"); bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget"); - std::vector AllInstAliases = - Records.getAllDerivedDefinitions("InstAlias"); + std::vector AllInstAliases = + Records.getAllDerivedDefinitions("InstAlias"); // Create a map from the qualified name to a list of potential matches. typedef std::set, AliasPriorityComparator> @@ -843,8 +868,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { std::vector> PrintMethods; // A list of MCOperandPredicates for all operands in use, and the reverse map - std::vector MCOpPredicates; - DenseMap MCOpPredicateMap; + std::vector MCOpPredicates; + DenseMap MCOpPredicateMap; for (auto &Aliases : AliasMap) { // Collection of instruction alias rules. May contain ambiguous rules. @@ -854,8 +879,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { const CodeGenInstAlias &CGA = Alias.first; unsigned LastOpNo = CGA.ResultInstOperandIndex.size(); std::string FlatInstAsmString = - CodeGenInstruction::FlattenAsmStringVariants(CGA.ResultInst->AsmString, - Variant); + CodeGenInstruction::FlattenAsmStringVariants( + CGA.ResultInst->AsmString, Variant); unsigned NumResultOps = CountNumOperands(FlatInstAsmString, Variant); std::string FlatAliasAsmString = @@ -881,8 +906,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { unsigned OpNum = Operands.getSubOperandNumber(MIOpNum).first; if (Operands[OpNum].MINumOperands == 1 && Operands[OpNum].getTiedRegister() != -1) { - // Tied operands of different RegisterClass should be explicit within - // an instruction's syntax and so cannot be skipped. + // Tied operands of different RegisterClass should be explicit + // within an instruction's syntax and so cannot be skipped. int TiedOpNum = Operands[OpNum].getTiedRegister(); if (Operands[OpNum].Rec->getName() == Operands[TiedOpNum].Rec->getName()) { @@ -1083,7 +1108,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (It == IAPrinterMap.end()) continue; std::vector &IAPs = It->second; - std::vector UniqueIAPs; + std::vector UniqueIAPs; // Remove any ambiguous alias rules. 
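AliasPriorityComparator orders aliases by priority and breaks ties with a stable record ID so iteration order never depends on heap addresses. The same idea in standalone form, with simplified stand-ins for CodeGenInstAlias and Record:

#include <cstdio>
#include <set>
#include <string>
#include <utility>

struct FakeRecord {
  unsigned ID; // stands in for Record::getID()
  std::string Name;
};

struct PriorityThenID {
  using ValueType = std::pair<const FakeRecord *, int>; // (alias, priority)
  bool operator()(const ValueType &LHS, const ValueType &RHS) const {
    if (LHS.second == RHS.second)
      return LHS.first->ID < RHS.first->ID; // deterministic tie-break
    return LHS.second > RHS.second;         // larger priorities first
  }
};

int main() {
  FakeRecord A{1, "aliasA"}, B{2, "aliasB"};
  std::set<PriorityThenID::ValueType, PriorityThenID> Aliases{
      {&B, 3}, {&A, 3}, {&A, 5}};
  for (const auto &[Rec, Prio] : Aliases)
    printf("%s (priority %d)\n", Rec->Name.c_str(), Prio);
}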
for (auto &LHS : IAPs) { @@ -1099,7 +1124,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { UniqueIAPs.push_back(&LHS); } - if (UniqueIAPs.empty()) continue; + if (UniqueIAPs.empty()) + continue; unsigned PatternStart = PatternCount; @@ -1193,7 +1219,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (MCOpPredicates.empty()) O.indent(2) << " nullptr,\n"; else - O.indent(2) << " &" << Target.getName() << ClassName << "ValidateMCOperand,\n"; + O.indent(2) << " &" << Target.getName() << ClassName + << "ValidateMCOperand,\n"; O.indent(2) << "};\n"; O.indent(2) << "const char *AsmString = matchAliasPatterns(MI, " @@ -1262,21 +1289,22 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { << " break;\n"; } O << " }\n"; - } + } O << "}\n\n"; if (!MCOpPredicates.empty()) { O << "static bool " << Target.getName() << ClassName << "ValidateMCOperand(const MCOperand &MCOp,\n" << " const MCSubtargetInfo &STI,\n" - << " unsigned PredicateIndex) {\n" + << " unsigned PredicateIndex) {\n" << " switch (PredicateIndex) {\n" << " default:\n" << " llvm_unreachable(\"Unknown MCOperandPredicate kind\");\n" << " break;\n"; for (unsigned i = 0; i < MCOpPredicates.size(); ++i) { - StringRef MCOpPred = MCOpPredicates[i]->getValueAsString("MCOperandPredicate"); + StringRef MCOpPred = + MCOpPredicates[i]->getValueAsString("MCOperandPredicate"); O << " case " << i + 1 << ": {\n" << MCOpPred.data() << "\n" << " }\n"; diff --git a/llvm/utils/TableGen/AsmWriterInst.cpp b/llvm/utils/TableGen/AsmWriterInst.cpp index c955859..1fa609e 100644 --- a/llvm/utils/TableGen/AsmWriterInst.cpp +++ b/llvm/utils/TableGen/AsmWriterInst.cpp @@ -57,54 +57,55 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, std::string::size_type LastEmitted = 0; while (LastEmitted != AsmString.size()) { std::string::size_type DollarPos = - AsmString.find_first_of("$\\", LastEmitted); - if (DollarPos == std::string::npos) DollarPos = AsmString.size(); + AsmString.find_first_of("$\\", LastEmitted); + if (DollarPos == std::string::npos) + DollarPos = AsmString.size(); // Emit a constant string fragment. 
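The `case i + 1` bodies emitted above are biased because predicate index 0 is reserved to mean "no predicate attached". A simplified model of such a validator; the operand stub and the uimm5 range check are invented, and the real generated function leaves index 0 to its callers rather than handling it in the switch:

#include <cassert>

struct MCOperandStub {
  int Imm;
};

static bool validateMCOperand(const MCOperandStub &MCOp,
                              unsigned PredicateIndex) {
  switch (PredicateIndex) {
  default:
    assert(false && "Unknown MCOperandPredicate kind");
    return false;
  case 0:
    return true; // no predicate attached to this operand
  case 1:
    return MCOp.Imm >= 0 && MCOp.Imm < 32; // e.g. a uimm5 range check
  }
}

int main() {
  assert(validateMCOperand({7}, 1));
  assert(!validateMCOperand({64}, 1));
  assert(validateMCOperand({-5}, 0));
}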
if (DollarPos != LastEmitted) { for (; LastEmitted != DollarPos; ++LastEmitted) switch (AsmString[LastEmitted]) { - case '\n': - AddLiteralString("\\n"); - break; - case '\t': - AddLiteralString("\\t"); - break; - case '"': - AddLiteralString("\\\""); - break; - case '\\': - AddLiteralString("\\\\"); - break; - default: - AddLiteralString(std::string(1, AsmString[LastEmitted])); - break; + case '\n': + AddLiteralString("\\n"); + break; + case '\t': + AddLiteralString("\\t"); + break; + case '"': + AddLiteralString("\\\""); + break; + case '\\': + AddLiteralString("\\\\"); + break; + default: + AddLiteralString(std::string(1, AsmString[LastEmitted])); + break; } } else if (AsmString[DollarPos] == '\\') { - if (DollarPos+1 != AsmString.size()) { - if (AsmString[DollarPos+1] == 'n') { + if (DollarPos + 1 != AsmString.size()) { + if (AsmString[DollarPos + 1] == 'n') { AddLiteralString("\\n"); - } else if (AsmString[DollarPos+1] == 't') { + } else if (AsmString[DollarPos + 1] == 't') { AddLiteralString("\\t"); - } else if (std::string("${|}\\").find(AsmString[DollarPos+1]) - != std::string::npos) { - AddLiteralString(std::string(1, AsmString[DollarPos+1])); + } else if (std::string("${|}\\").find(AsmString[DollarPos + 1]) != + std::string::npos) { + AddLiteralString(std::string(1, AsmString[DollarPos + 1])); } else { PrintFatalError( CGI.TheDef->getLoc(), "Non-supported escaped character found in instruction '" + CGI.TheDef->getName() + "'!"); } - LastEmitted = DollarPos+2; + LastEmitted = DollarPos + 2; continue; } - } else if (DollarPos+1 != AsmString.size() && - AsmString[DollarPos+1] == '$') { - AddLiteralString("$"); // "$$" -> $ - LastEmitted = DollarPos+2; + } else if (DollarPos + 1 != AsmString.size() && + AsmString[DollarPos + 1] == '$') { + AddLiteralString("$"); // "$$" -> $ + LastEmitted = DollarPos + 2; } else { // Get the name of the variable. - std::string::size_type VarEnd = DollarPos+1; + std::string::size_type VarEnd = DollarPos + 1; // handle ${foo}bar as $foo by detecting whether the character following // the dollar sign is a curly brace. If so, advance VarEnd and DollarPos @@ -118,7 +119,8 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, while (VarEnd < AsmString.size() && isIdentChar(AsmString[VarEnd])) ++VarEnd; - StringRef VarName(AsmString.data()+DollarPos+1, VarEnd-DollarPos-1); + StringRef VarName(AsmString.data() + DollarPos + 1, + VarEnd - DollarPos - 1); // Modifier - Support ${foo:modifier} syntax, where "modifier" is passed // into printOperand. Also support ${:feature}, which is passed into @@ -190,13 +192,14 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, /// specified instruction except for one differing operand, return the differing /// operand number. If more than one operand mismatches, return ~1, otherwise /// if the instructions are identical return ~0. -unsigned AsmWriterInst::MatchesAllButOneOp(const AsmWriterInst &Other)const{ - if (Operands.size() != Other.Operands.size()) return ~1; +unsigned AsmWriterInst::MatchesAllButOneOp(const AsmWriterInst &Other) const { + if (Operands.size() != Other.Operands.size()) + return ~1; unsigned MismatchOperand = ~0U; for (unsigned i = 0, e = Operands.size(); i != e; ++i) { if (Operands[i] != Other.Operands[i]) { - if (MismatchOperand != ~0U) // Already have one mismatch? + if (MismatchOperand != ~0U) // Already have one mismatch? 
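As a standalone illustration of what this scanning loop accomplishes, the reduced version below handles only literal text, backslash escapes, and the "$$" escape; operand references such as $dst, which the real constructor turns into AsmWriterOperand entries, are replaced by '?' in this sketch:

#include <cctype>
#include <cstdio>
#include <string>

static std::string flatten(const std::string &AsmString) {
  std::string Out;
  std::string::size_type LastEmitted = 0;
  while (LastEmitted != AsmString.size()) {
    std::string::size_type DollarPos =
        AsmString.find_first_of("$\\", LastEmitted);
    if (DollarPos == std::string::npos)
      DollarPos = AsmString.size();

    Out.append(AsmString, LastEmitted, DollarPos - LastEmitted);
    if (DollarPos == AsmString.size())
      break;

    if (AsmString[DollarPos] == '\\' && DollarPos + 1 != AsmString.size()) {
      char C = AsmString[DollarPos + 1];
      Out += (C == 'n') ? '\n' : (C == 't') ? '\t' : C;
      LastEmitted = DollarPos + 2;
    } else if (DollarPos + 1 != AsmString.size() &&
               AsmString[DollarPos + 1] == '$') {
      Out += '$'; // "$$" -> $
      LastEmitted = DollarPos + 2;
    } else {
      Out += '?'; // operand reference like $src; elided in this sketch
      LastEmitted = DollarPos + 1;
      while (LastEmitted < AsmString.size() &&
             (std::isalnum((unsigned char)AsmString[LastEmitted]) ||
              AsmString[LastEmitted] == '_'))
        ++LastEmitted;
    }
  }
  return Out;
}

int main() {
  // Prints "addi ?, ?, $5" followed by a newline.
  printf("%s", flatten("addi $dst, $src1, $$5\\n").c_str());
}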
return ~1U; MismatchOperand = i; } diff --git a/llvm/utils/TableGen/AsmWriterInst.h b/llvm/utils/TableGen/AsmWriterInst.h index 9c93e82..f0ebf79 100644 --- a/llvm/utils/TableGen/AsmWriterInst.h +++ b/llvm/utils/TableGen/AsmWriterInst.h @@ -20,88 +20,88 @@ #include namespace llvm { - class CodeGenInstruction; - - struct AsmWriterOperand { - enum OpType { - // Output this text surrounded by quotes to the asm. - isLiteralTextOperand, - // This is the name of a routine to call to print the operand. - isMachineInstrOperand, - // Output this text verbatim to the asm writer. It is code that - // will output some text to the asm. - isLiteralStatementOperand - } OperandType; - - /// MiOpNo - For isMachineInstrOperand, this is the operand number of the - /// machine instruction. - unsigned MIOpNo = 0; - - /// Str - For isLiteralTextOperand, this IS the literal text. For - /// isMachineInstrOperand, this is the PrinterMethodName for the operand.. - /// For isLiteralStatementOperand, this is the code to insert verbatim - /// into the asm writer. - std::string Str; - - /// MiModifier - For isMachineInstrOperand, this is the modifier string for - /// an operand, specified with syntax like ${opname:modifier}. - std::string MiModifier; - - bool PCRel = false; - - // To make VS STL happy - AsmWriterOperand(OpType op = isLiteralTextOperand):OperandType(op) {} - - AsmWriterOperand(const std::string &LitStr, - OpType op = isLiteralTextOperand) - : OperandType(op), Str(LitStr) {} - - AsmWriterOperand(const std::string &Printer, unsigned _MIOpNo, - const std::string &Modifier, - OpType op = isMachineInstrOperand, bool PCRel = false) - : OperandType(op), MIOpNo(_MIOpNo), Str(Printer), MiModifier(Modifier), - PCRel(PCRel) {} - - bool operator!=(const AsmWriterOperand &Other) const { - if (OperandType != Other.OperandType || Str != Other.Str) return true; - if (OperandType == isMachineInstrOperand) - return MIOpNo != Other.MIOpNo || MiModifier != Other.MiModifier || - PCRel != Other.PCRel; - return false; - } - bool operator==(const AsmWriterOperand &Other) const { - return !operator!=(Other); - } - - /// getCode - Return the code that prints this operand. - std::string getCode(bool PassSubtarget) const; - }; - - class AsmWriterInst { - public: - std::vector Operands; - const CodeGenInstruction *CGI; - unsigned CGIIndex; - - AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, - unsigned Variant); - - /// MatchesAllButOneOp - If this instruction is exactly identical to the - /// specified instruction except for one differing operand, return the - /// differing operand number. Otherwise return ~0. - unsigned MatchesAllButOneOp(const AsmWriterInst &Other) const; - - private: - void AddLiteralString(const std::string &Str) { - // If the last operand was already a literal text string, append this to - // it, otherwise add a new operand. - if (!Operands.empty() && - Operands.back().OperandType == AsmWriterOperand::isLiteralTextOperand) - Operands.back().Str.append(Str); - else - Operands.push_back(AsmWriterOperand(Str)); - } - }; -} +class CodeGenInstruction; + +struct AsmWriterOperand { + enum OpType { + // Output this text surrounded by quotes to the asm. + isLiteralTextOperand, + // This is the name of a routine to call to print the operand. + isMachineInstrOperand, + // Output this text verbatim to the asm writer. It is code that + // will output some text to the asm. 
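MatchesAllButOneOp folds three outcomes into one unsigned return value: ~0U for "identical", ~1U for "more than one mismatch", otherwise the index of the single differing operand. The same convention in isolation, over plain int vectors:

#include <cassert>
#include <vector>

static unsigned matchesAllButOne(const std::vector<int> &A,
                                 const std::vector<int> &B) {
  if (A.size() != B.size())
    return ~1U;
  unsigned Mismatch = ~0U;
  for (unsigned i = 0, e = A.size(); i != e; ++i)
    if (A[i] != B[i]) {
      if (Mismatch != ~0U) // already have one mismatch?
        return ~1U;
      Mismatch = i;
    }
  return Mismatch;
}

int main() {
  assert(matchesAllButOne({1, 2, 3}, {1, 2, 3}) == ~0U);
  assert(matchesAllButOne({1, 2, 3}, {1, 9, 3}) == 1);
  assert(matchesAllButOne({1, 2, 3}, {7, 9, 3}) == ~1U);
}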
+ isLiteralStatementOperand + } OperandType; + + /// MiOpNo - For isMachineInstrOperand, this is the operand number of the + /// machine instruction. + unsigned MIOpNo = 0; + + /// Str - For isLiteralTextOperand, this IS the literal text. For + /// isMachineInstrOperand, this is the PrinterMethodName for the operand.. + /// For isLiteralStatementOperand, this is the code to insert verbatim + /// into the asm writer. + std::string Str; + + /// MiModifier - For isMachineInstrOperand, this is the modifier string for + /// an operand, specified with syntax like ${opname:modifier}. + std::string MiModifier; + + bool PCRel = false; + + // To make VS STL happy + AsmWriterOperand(OpType op = isLiteralTextOperand) : OperandType(op) {} + + AsmWriterOperand(const std::string &LitStr, OpType op = isLiteralTextOperand) + : OperandType(op), Str(LitStr) {} + + AsmWriterOperand(const std::string &Printer, unsigned _MIOpNo, + const std::string &Modifier, + OpType op = isMachineInstrOperand, bool PCRel = false) + : OperandType(op), MIOpNo(_MIOpNo), Str(Printer), MiModifier(Modifier), + PCRel(PCRel) {} + + bool operator!=(const AsmWriterOperand &Other) const { + if (OperandType != Other.OperandType || Str != Other.Str) + return true; + if (OperandType == isMachineInstrOperand) + return MIOpNo != Other.MIOpNo || MiModifier != Other.MiModifier || + PCRel != Other.PCRel; + return false; + } + bool operator==(const AsmWriterOperand &Other) const { + return !operator!=(Other); + } + + /// getCode - Return the code that prints this operand. + std::string getCode(bool PassSubtarget) const; +}; + +class AsmWriterInst { +public: + std::vector Operands; + const CodeGenInstruction *CGI; + unsigned CGIIndex; + + AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex, + unsigned Variant); + + /// MatchesAllButOneOp - If this instruction is exactly identical to the + /// specified instruction except for one differing operand, return the + /// differing operand number. Otherwise return ~0. + unsigned MatchesAllButOneOp(const AsmWriterInst &Other) const; + +private: + void AddLiteralString(const std::string &Str) { + // If the last operand was already a literal text string, append this to + // it, otherwise add a new operand. + if (!Operands.empty() && + Operands.back().OperandType == AsmWriterOperand::isLiteralTextOperand) + Operands.back().Str.append(Str); + else + Operands.push_back(AsmWriterOperand(Str)); + } +}; +} // namespace llvm #endif diff --git a/llvm/utils/TableGen/CTagsEmitter.cpp b/llvm/utils/TableGen/CTagsEmitter.cpp index b8e27d0..bda18936 100644 --- a/llvm/utils/TableGen/CTagsEmitter.cpp +++ b/llvm/utils/TableGen/CTagsEmitter.cpp @@ -1,4 +1,4 @@ -//===- CTagsEmitter.cpp - Generate ctags-compatible index ------------------===// +//===- CTagsEmitter.cpp - Generate ctags-compatible index -----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -30,6 +30,7 @@ private: StringRef Id; StringRef BufferIdentifier; unsigned Line; + public: Tag(StringRef Name, const SMLoc Location) : Id(Name) { const MemoryBuffer *CurMB = @@ -39,7 +40,8 @@ public: Line = LineAndColumn.first; } int operator<(const Tag &B) const { - return std::make_tuple(Id, BufferIdentifier, Line) < std::make_tuple(B.Id, B.BufferIdentifier, B.Line); + return std::make_tuple(Id, BufferIdentifier, Line) < + std::make_tuple(B.Id, B.BufferIdentifier, B.Line); } void emit(raw_ostream &OS) const { OS << Id << "\t" << BufferIdentifier << "\t" << Line << "\n"; @@ -49,6 +51,7 @@ public: class CTagsEmitter { private: RecordKeeper &Records; + public: CTagsEmitter(RecordKeeper &R) : Records(R) {} diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp index 02e7000..3c3a287 100644 --- a/llvm/utils/TableGen/CallingConvEmitter.cpp +++ b/llvm/utils/TableGen/CallingConvEmitter.cpp @@ -117,23 +117,24 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { O << "\n"; EmitAction(Action, 2, O); } - + O << "\n return true; // CC didn't match.\n"; O << "}\n"; } -void CallingConvEmitter::EmitAction(Record *Action, - unsigned Indent, raw_ostream &O) { +void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, + raw_ostream &O) { std::string IndentStr = std::string(Indent, ' '); if (Action->isSubClassOf("CCPredicateAction")) { O << IndentStr << "if ("; - + if (Action->isSubClassOf("CCIfType")) { ListInit *VTs = Action->getValueAsListInit("VTs"); for (unsigned i = 0, e = VTs->size(); i != e; ++i) { Record *VT = VTs->getElementAsRecord(i); - if (i != 0) O << " ||\n " << IndentStr; + if (i != 0) + O << " ||\n " << IndentStr; O << "LocVT == " << getEnumName(getValueType(VT)); } @@ -143,9 +144,9 @@ void CallingConvEmitter::EmitAction(Record *Action, errs() << *Action; PrintFatalError(Action->getLoc(), "Unknown CCPredicateAction!"); } - + O << ") {\n"; - EmitAction(Action->getValueAsDef("SubAction"), Indent+2, O); + EmitAction(Action->getValueAsDef("SubAction"), Indent + 2, O); O << IndentStr << "}\n"; } else { if (Action->isSubClassOf("CCDelegateTo")) { @@ -241,8 +242,8 @@ void CallingConvEmitter::EmitAction(Record *Action, O << "\n" << IndentStr << "};\n"; O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList" - << RegListNumber << ", " << "RegList" << ShadowRegListNumber - << ")) {\n"; + << RegListNumber << ", " + << "RegList" << ShadowRegListNumber << ")) {\n"; } O << IndentStr << " State.addLoc(CCValAssign::getReg(ValNo, ValVT, " << "Reg, LocVT, LocInfo));\n"; @@ -257,7 +258,8 @@ void CallingConvEmitter::EmitAction(Record *Action, if (Size) O << Size << ", "; else - O << "\n" << IndentStr + O << "\n" + << IndentStr << " State.getMachineFunction().getDataLayout()." "getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext()))," " "; @@ -269,8 +271,8 @@ void CallingConvEmitter::EmitAction(Record *Action, << " State.getMachineFunction().getDataLayout()." 
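The Tag comparison above leans on std::tuple's lexicographic operator< instead of hand-chained field comparisons. The same idiom standalone, with simplified fields (the real class declares operator< returning int, a quirk the reformat preserves):

#include <cassert>
#include <string>
#include <tuple>

struct Tag {
  std::string Id, Buffer;
  unsigned Line;
  bool operator<(const Tag &B) const {
    return std::make_tuple(Id, Buffer, Line) <
           std::make_tuple(B.Id, B.Buffer, B.Line);
  }
};

int main() {
  Tag A{"foo", "a.td", 10}, B{"foo", "a.td", 12}, C{"bar", "b.td", 1};
  assert(A < B); // same Id and buffer, earlier line wins
  assert(C < A); // "bar" sorts before "foo"
}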
"getABITypeAlign(EVT(LocVT).getTypeForEVT(State.getContext()" "))"; - O << ");\n" << IndentStr - << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" + O << ");\n" + << IndentStr << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset" << Counter << ", LocVT, LocInfo));\n"; O << IndentStr << "return false;\n"; } else if (Action->isSubClassOf("CCAssignToStackWithShadow")) { @@ -281,7 +283,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned ShadowRegListNumber = ++Counter; O << IndentStr << "static const MCPhysReg ShadowRegList" - << ShadowRegListNumber << "[] = {\n"; + << ShadowRegListNumber << "[] = {\n"; O << IndentStr << " "; ListSeparator LS; for (unsigned i = 0, e = ShadowRegList->size(); i != e; ++i) @@ -297,7 +299,7 @@ void CallingConvEmitter::EmitAction(Record *Action, } else if (Action->isSubClassOf("CCPromoteToType")) { Record *DestTy = Action->getValueAsDef("DestTy"); MVT::SimpleValueType DestVT = getValueType(DestTy); - O << IndentStr << "LocVT = " << getEnumName(DestVT) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n"; if (MVT(DestVT).isFloatingPoint()) { O << IndentStr << "LocInfo = CCValAssign::FPExt;\n"; } else { @@ -326,15 +328,18 @@ void CallingConvEmitter::EmitAction(Record *Action, } } else if (Action->isSubClassOf("CCBitConvertToType")) { Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) + << ";\n"; O << IndentStr << "LocInfo = CCValAssign::BCvt;\n"; } else if (Action->isSubClassOf("CCTruncToType")) { Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) + << ";\n"; O << IndentStr << "LocInfo = CCValAssign::Trunc;\n"; } else if (Action->isSubClassOf("CCPassIndirect")) { Record *DestTy = Action->getValueAsDef("DestTy"); - O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) + << ";\n"; O << IndentStr << "LocInfo = CCValAssign::Indirect;\n"; } else if (Action->isSubClassOf("CCPassByVal")) { int Size = Action->getValueAsInt("Size"); @@ -343,8 +348,8 @@ void CallingConvEmitter::EmitAction(Record *Action, << Size << ", Align(" << Align << "), ArgFlags);\n"; O << IndentStr << "return false;\n"; } else if (Action->isSubClassOf("CCCustom")) { - O << IndentStr - << "if (" << Action->getValueAsString("FuncName") << "(ValNo, ValVT, " + O << IndentStr << "if (" << Action->getValueAsString("FuncName") + << "(ValNo, ValVT, " << "LocVT, LocInfo, ArgFlags, State))\n"; O << IndentStr << " return false;\n"; } else { @@ -376,9 +381,8 @@ void CallingConvEmitter::EmitArgRegisterLists(raw_ostream &O) { std::set &InnerRegisters = InnerEntry.second; if (InnerRegisters.find(CCName) != InnerRegisters.end()) { - AssignedRegsMap[InnerCCName].insert( - AssignedRegsMap[CCName].begin(), - AssignedRegsMap[CCName].end()); + AssignedRegsMap[InnerCCName].insert(AssignedRegsMap[CCName].begin(), + AssignedRegsMap[CCName].end()); InnerRegisters.erase(CCName); } } diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index 48ed319..d7020d1 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -78,8 +78,8 @@ private: // If the VarBitInit at position 'bit' matches the specified variable then // return the variable 
bit position. Otherwise return -1. -int CodeEmitterGen::getVariableBit(const std::string &VarName, - BitsInit *BI, int bit) { +int CodeEmitterGen::getVariableBit(const std::string &VarName, BitsInit *BI, + int bit) { if (VarBitInit *VBI = dyn_cast(BI->getBit(bit))) { if (VarInit *VI = dyn_cast(VBI->getBitVar())) if (VI->getName() == VarName) @@ -101,16 +101,16 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, CodeGenInstruction &CGI = Target.getInstruction(R); // Determine if VarName actually contributes to the Inst encoding. - int bit = BI->getNumBits()-1; + int bit = BI->getNumBits() - 1; // Scan for a bit that this contributed to. - for (; bit >= 0; ) { + for (; bit >= 0;) { if (getVariableBit(VarName, BI, bit) != -1) break; - + --bit; } - + // If we found no bits, ignore this value, otherwise emit the call to get the // operand encoding. if (bit < 0) @@ -127,12 +127,14 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, // Get the machine operand number for the indicated operand. OpIdx = CGI.Operands[OpIdx].MIOperandNo; } else { - PrintError(R, Twine("No operand named ") + VarName + " in record " + R->getName()); + PrintError(R, Twine("No operand named ") + VarName + " in record " + + R->getName()); return false; } if (CGI.Operands.isFlatOperandNotEmitted(OpIdx)) { - PrintError(R, "Operand " + VarName + " used but also marked as not emitted!"); + PrintError(R, + "Operand " + VarName + " used but also marked as not emitted!"); return false; } @@ -156,10 +158,12 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, Case += ", Fixups, STI);\n"; } else { if (UseAPInt) { - Case += " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; + Case += + " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; Case += ", op, Fixups, STI"; } else { - Case += " op = getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; + Case += " op = getMachineOpValue(MI, MI.getOperand(" + + utostr(OpIdx) + ")"; Case += ", Fixups, STI"; } Case += ");\n"; @@ -193,9 +197,9 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, } unsigned BitOffset = -1; - for (; bit >= 0; ) { + for (; bit >= 0;) { int varBit = getVariableBit(VarName, BI, bit); - + // If this bit isn't from a variable, skip it. 
if (varBit == -1) { --bit; continue; } @@ -209,7 +213,8 @@ int N = 1; for (--bit; bit >= 0;) { varBit = getVariableBit(VarName, BI, bit); - if (varBit == -1 || varBit != (beginVarBit - N)) break; + if (varBit == -1 || varBit != (beginVarBit - N)) + break; ++N; --bit; } @@ -368,7 +373,9 @@ void CodeEmitterGen::emitInstructionBaseValues( if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) { - o << " "; emitInstBits(o, APInt(BitWidth, 0)); o << ",\n"; + o << " "; + emitInstBits(o, APInt(BitWidth, 0)); + o << ",\n"; continue; } @@ -419,13 +426,13 @@ void CodeEmitterGen::run(raw_ostream &o) { emitSourceFileHeader("Machine Code Emitter", o); CodeGenTarget Target(Records); - std::vector<Record*> Insts = Records.getAllDerivedDefinitions("Instruction"); + std::vector<Record *> Insts = Records.getAllDerivedDefinitions("Instruction"); // For little-endian instruction bit encodings, reverse the bit order Target.reverseBitsForLittleEndianEncoding(); - ArrayRef<const CodeGenInstruction*> NumberedInstructions = - Target.getInstructionsByEnumValue(); + ArrayRef<const CodeGenInstruction *> NumberedInstructions = + Target.getInstructionsByEnumValue(); if (any_of(NumberedInstructions, [](const CodeGenInstruction *CGI) { Record *R = CGI->TheDef; diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index f88e25e..62e0482 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -38,18 +38,10 @@ using namespace llvm; static inline bool isIntegerOrPtr(MVT VT) { return VT.isInteger() || VT == MVT::iPTR; } -static inline bool isFloatingPoint(MVT VT) { - return VT.isFloatingPoint(); -} -static inline bool isVector(MVT VT) { - return VT.isVector(); -} -static inline bool isScalar(MVT VT) { - return !VT.isVector(); -} -static inline bool isScalarInteger(MVT VT) { - return VT.isScalarInteger(); -} +static inline bool isFloatingPoint(MVT VT) { return VT.isFloatingPoint(); } +static inline bool isVector(MVT VT) { return VT.isVector(); } +static inline bool isScalar(MVT VT) { return !VT.isVector(); } +static inline bool isScalarInteger(MVT VT) { return VT.isScalarInteger(); } template <typename Predicate> static bool berase_if(MachineValueTypeSet &S, Predicate P) { @@ -173,8 +165,7 @@ bool TypeSetByHwMode::constrain(const TypeSetByHwMode &VTS) { return Changed; } -template <typename Predicate> -bool TypeSetByHwMode::constrain(Predicate P) { +template <typename Predicate> bool TypeSetByHwMode::constrain(Predicate P) { bool Changed = false; for (auto &I : *this) Changed |= berase_if(I.second, [&P](MVT VT) { return !P(VT); }); @@ -257,20 +248,18 @@ bool TypeSetByHwMode::operator==(const TypeSetByHwMode &VTS) const { } namespace llvm { - raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T) { - T.writeToStream(OS); - return OS; - } - raw_ostream &operator<<(raw_ostream &OS, const TypeSetByHwMode &T) { - T.writeToStream(OS); - return OS; - } +raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T) { + T.writeToStream(OS); + return OS; +} +raw_ostream &operator<<(raw_ostream &OS, const TypeSetByHwMode &T) { + T.writeToStream(OS); + return OS; } +} // namespace llvm LLVM_DUMP_METHOD -void TypeSetByHwMode::dump() const { - dbgs() << *this << '\n'; -} +void TypeSetByHwMode::dump() const { dbgs() << *this << '\n'; } bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) { bool OutP = Out.count(MVT::iPTR), InP = In.count(MVT::iPTR); @@ -335,7 +324,7 @@ bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) { // OutP == true
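The loop with `varBit != (beginVarBit - N)` scans instruction bits from the MSB and groups bits whose field positions fall in lockstep, so one contiguous run becomes a single emitted operation. The same logic over a toy bit map in which Inst{12-8} carries Field{4-0} (the array stands in for a BitsInit query):

#include <cstdio>

int main() {
  // Maps instruction bit -> field bit, or -1 when the bit is fixed.
  const int BitMap[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
                          0,  1,  2,  3,  4,  -1, -1, -1};
  int bit = 15;
  while (bit >= 0) {
    if (BitMap[bit] == -1) {
      --bit;
      continue;
    }

    // Found the start of a run; extend it while field bits stay contiguous.
    int beginInstBit = bit, beginVarBit = BitMap[bit], N = 1;
    for (--bit; bit >= 0; --bit) {
      if (BitMap[bit] == -1 || BitMap[bit] != beginVarBit - N)
        break;
      ++N;
    }
    printf("Inst{%d-%d} <- Field{%d-%d}\n", beginInstBit,
           beginInstBit - N + 1, beginVarBit, beginVarBit - N + 1);
  }
}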
SetType InOnly = subtract(In, Out); unsigned SizeOut = Out.size(); - berase_if(Out, CompIn); // This will remove at least the iPTR. + berase_if(Out, CompIn); // This will remove at least the iPTR. unsigned NumI = llvm::count_if(InOnly, isScalarInteger); if (NumI == 0) { // iPTR deleted from Out. @@ -568,29 +557,29 @@ bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small, TypeSetByHwMode &Big, // smaller-or-equal than MinS. auto MinS = min_if(S.begin(), S.end(), isScalar, LT); if (MinS != S.end()) - Changed |= berase_if(B, std::bind(SameKindLE, - std::placeholders::_1, *MinS)); + Changed |= + berase_if(B, std::bind(SameKindLE, std::placeholders::_1, *MinS)); // MaxS = max scalar in Big, remove all scalars from Small that are // larger than MaxS. auto MaxS = max_if(B.begin(), B.end(), isScalar, LT); if (MaxS != B.end()) - Changed |= berase_if(S, std::bind(SameKindLE, - *MaxS, std::placeholders::_1)); + Changed |= + berase_if(S, std::bind(SameKindLE, *MaxS, std::placeholders::_1)); // MinV = min vector in Small, remove all vectors from Big that are // smaller-or-equal than MinV. auto MinV = min_if(S.begin(), S.end(), isVector, LT); if (MinV != S.end()) - Changed |= berase_if(B, std::bind(SameKindLE, - std::placeholders::_1, *MinV)); + Changed |= + berase_if(B, std::bind(SameKindLE, std::placeholders::_1, *MinV)); // MaxV = max vector in Big, remove all vectors from Small that are // larger than MaxV. auto MaxV = max_if(B.begin(), B.end(), isVector, LT); if (MaxV != B.end()) - Changed |= berase_if(S, std::bind(SameKindLE, - *MaxV, std::placeholders::_1)); + Changed |= + berase_if(S, std::bind(SameKindLE, *MaxV, std::placeholders::_1)); } return Changed; @@ -618,8 +607,8 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec, TypeSetByHwMode::SetType &V = Vec.get(M); TypeSetByHwMode::SetType &E = Elem.get(M); - Changed |= berase_if(V, isScalar); // Scalar = !vector - Changed |= berase_if(E, isVector); // Vector = !scalar + Changed |= berase_if(V, isScalar); // Scalar = !vector + Changed |= berase_if(E, isVector); // Vector = !scalar assert(!V.empty() && !E.empty()); MachineValueTypeSet VT, ST; @@ -632,8 +621,8 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec, // Remove from V all (vector) types whose element type is not in S. Changed |= berase_if(V, [&ST](MVT T) -> bool { - return !ST.count(T.getVectorElementType()); - }); + return !ST.count(T.getVectorElementType()); + }); // Remove from E all (scalar) types, for which there is no corresponding // type in V. 
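berase_if, used heavily throughout this type-inference code, erases every element matching a predicate and reports whether anything was removed, so callers can accumulate a Changed flag. A minimal version over std::set rather than MachineValueTypeSet:

#include <cassert>
#include <set>

template <typename T, typename Predicate>
bool berase_if(std::set<T> &S, Predicate P) {
  bool Erased = false;
  for (auto I = S.begin(); I != S.end();) {
    if (P(*I)) {
      I = S.erase(I);
      Erased = true;
    } else {
      ++I;
    }
  }
  return Erased;
}

int main() {
  std::set<int> Sizes = {8, 16, 32, 64};
  bool Changed = berase_if(Sizes, [](int N) { return N < 32; });
  assert(Changed && Sizes.size() == 2);                     // 8 and 16 removed
  assert(!berase_if(Sizes, [](int N) { return N > 64; }));  // nothing matches
}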
Changed |= berase_if(E, [&VT](MVT T) -> bool { return !VT.count(T); }); @@ -887,7 +876,6 @@ TypeInfer::ValidateOnExit::~ValidateOnExit() { } } - //===----------------------------------------------------------------------===// // ScopedName Implementation //===----------------------------------------------------------------------===// @@ -896,10 +884,7 @@ bool ScopedName::operator==(const ScopedName &o) const { return Scope == o.Scope && Identifier == o.Identifier; } -bool ScopedName::operator!=(const ScopedName &o) const { - return !(*this == o); -} - +bool ScopedName::operator!=(const ScopedName &o) const { return !(*this == o); } //===----------------------------------------------------------------------===// // TreePredicateFn Implementation @@ -1011,8 +996,9 @@ std::string TreePredicateFn::getPredCode() const { PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), "IsAtomicOrderingAcquireRelease requires IsAtomic"); if (isAtomicOrderingSequentiallyConsistent()) - PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), - "IsAtomicOrderingSequentiallyConsistent requires IsAtomic"); + PrintFatalError( + getOrigPatFragRecord()->getRecord()->getLoc(), + "IsAtomicOrderingSequentiallyConsistent requires IsAtomic"); if (isAtomicOrderingAcquireOrStronger()) PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), "IsAtomicOrderingAcquireOrStronger requires IsAtomic"); @@ -1027,7 +1013,7 @@ std::string TreePredicateFn::getPredCode() const { if (isLoad() || isStore() || isAtomic()) { if (ListInit *AddressSpaces = getAddressSpaces()) { Code += "unsigned AddrSpace = cast(N)->getAddressSpace();\n" - " if ("; + " if ("; ListSeparator LS(" && "); for (Init *Val : AddressSpaces->getValues()) { @@ -1077,18 +1063,22 @@ std::string TreePredicateFn::getPredCode() const { "AtomicOrdering::SequentiallyConsistent) return false;\n"; if (isAtomic() && isAtomicOrderingAcquireOrStronger()) - Code += "if (!isAcquireOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (!isAcquireOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; if (isAtomic() && isAtomicOrderingWeakerThanAcquire()) - Code += "if (isAcquireOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (isAcquireOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; if (isAtomic() && isAtomicOrderingReleaseOrStronger()) - Code += "if (!isReleaseOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (!isReleaseOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; if (isAtomic() && isAtomicOrderingWeakerThanRelease()) - Code += "if (isReleaseOrStronger(cast(N)->getMergedOrdering())) " - "return false;\n"; + Code += + "if (isReleaseOrStronger(cast(N)->getMergedOrdering())) " + "return false;\n"; // TODO: Handle atomic sextload/zextload normally when ATOMIC_LOAD is removed. 
if (isAtomic() && (isZeroExtLoad() || isSignExtLoad())) @@ -1239,16 +1229,20 @@ bool TreePredicateFn::isAtomicOrderingSequentiallyConsistent() const { true); } bool TreePredicateFn::isAtomicOrderingAcquireOrStronger() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", true); + return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", + true); } bool TreePredicateFn::isAtomicOrderingWeakerThanAcquire() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", false); + return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", + false); } bool TreePredicateFn::isAtomicOrderingReleaseOrStronger() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", true); + return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", + true); } bool TreePredicateFn::isAtomicOrderingWeakerThanRelease() const { - return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", false); + return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", + false); } Record *TreePredicateFn::getMemoryVT() const { Record *R = getOrigPatFragRecord()->getRecord(); @@ -1428,7 +1422,7 @@ static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) { /// pattern. static unsigned getPatternSize(const TreePatternNode *P, const CodeGenDAGPatterns &CGP) { - unsigned Size = 3; // The node itself. + unsigned Size = 3; // The node itself. // If the root node is a ConstantSDNode, increases its size. // e.g. (set R32:$dst, 0). if (P->isLeaf() && isa(P->getLeafValue())) @@ -1459,7 +1453,7 @@ static unsigned getPatternSize(const TreePatternNode *P, } if (Child->isLeaf()) { if (isa(Child->getLeafValue())) - Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). + Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). else if (Child->getComplexPatternInfo(CGP)) Size += getPatternSize(Child, CGP); else if (isImmAllOnesAllZerosMatch(Child)) @@ -1474,8 +1468,7 @@ static unsigned getPatternSize(const TreePatternNode *P, /// Compute the complexity metric for the input pattern. This roughly /// corresponds to the number of nodes that are covered. 
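The getPatternSize scoring above can be mimicked in a few lines: every node costs 3, and a leaf pinned to a specific constant costs 5 (ConstantSDNode +3, specific value +2). A loose model over a stand-in tree type, ignoring the real function's complex-pattern and all-ones/all-zeros cases:

#include <cstdio>
#include <memory>
#include <vector>

struct Node {
  bool IsSpecificConstant = false;
  std::vector<std::unique_ptr<Node>> Children;
};

static unsigned patternSize(const Node &N) {
  unsigned Size = 3; // the node itself
  for (const auto &C : N.Children) {
    if (C->Children.empty() && C->IsSpecificConstant)
      Size += 5; // ConstantSDNode (+3) and a specific value (+2)
    else
      Size += patternSize(*C);
  }
  return Size;
}

int main() {
  // (add GPR:$x, 0): root + one generic leaf + one specific constant.
  Node Root;
  Root.Children.push_back(std::make_unique<Node>());
  auto Zero = std::make_unique<Node>();
  Zero->IsSpecificConstant = true;
  Root.Children.push_back(std::move(Zero));
  printf("complexity ~ %u\n", patternSize(Root)); // 3 + 3 + 5 = 11
}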
-int PatternToMatch:: -getPatternComplexity(const CodeGenDAGPatterns &CGP) const { +int PatternToMatch::getPatternComplexity(const CodeGenDAGPatterns &CGP) const { return getPatternSize(getSrcPattern(), CGP) + getAddedComplexity(); } @@ -1550,18 +1543,17 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) { } else if (R->isSubClassOf("SDTCisVTSmallerThanOp")) { ConstraintType = SDTCisVTSmallerThanOp; x.SDTCisVTSmallerThanOp_Info.OtherOperandNum = - R->getValueAsInt("OtherOperandNum"); + R->getValueAsInt("OtherOperandNum"); } else if (R->isSubClassOf("SDTCisOpSmallerThanOp")) { ConstraintType = SDTCisOpSmallerThanOp; x.SDTCisOpSmallerThanOp_Info.BigOperandNum = - R->getValueAsInt("BigOperandNum"); + R->getValueAsInt("BigOperandNum"); } else if (R->isSubClassOf("SDTCisEltOfVec")) { ConstraintType = SDTCisEltOfVec; x.SDTCisEltOfVec_Info.OtherOperandNum = R->getValueAsInt("OtherOpNum"); } else if (R->isSubClassOf("SDTCisSubVecOfVec")) { ConstraintType = SDTCisSubVecOfVec; - x.SDTCisSubVecOfVec_Info.OtherOperandNum = - R->getValueAsInt("OtherOpNum"); + x.SDTCisSubVecOfVec_Info.OtherOperandNum = R->getValueAsInt("OtherOpNum"); } else if (R->isSubClassOf("SDTCVecEltisVT")) { ConstraintType = SDTCVecEltisVT; VVT = getValueTypeByHwMode(R->getValueAsDef("VT"), CGH); @@ -1577,11 +1569,11 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) { } else if (R->isSubClassOf("SDTCisSameNumEltsAs")) { ConstraintType = SDTCisSameNumEltsAs; x.SDTCisSameNumEltsAs_Info.OtherOperandNum = - R->getValueAsInt("OtherOperandNum"); + R->getValueAsInt("OtherOperandNum"); } else if (R->isSubClassOf("SDTCisSameSizeAs")) { ConstraintType = SDTCisSameSizeAs; x.SDTCisSameSizeAs_Info.OtherOperandNum = - R->getValueAsInt("OtherOperandNum"); + R->getValueAsInt("OtherOperandNum"); } else { PrintFatalError(R->getLoc(), "Unrecognized SDTypeConstraint '" + R->getName() + "'!\n"); @@ -1604,8 +1596,8 @@ static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N, if (OpNo >= N->getNumChildren()) { std::string S; raw_string_ostream OS(S); - OS << "Invalid operand number in type constraint " - << (OpNo+NumResults) << " "; + OS << "Invalid operand number in type constraint " << (OpNo + NumResults) + << " "; N->print(OS); PrintFatalError(S); } @@ -1635,7 +1627,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, return NodeToApply->UpdateNodeType(ResNo, MVT::iPTR, TP); case SDTCisInt: // Require it to be one of the legal integer VTs. - return TI.EnforceInteger(NodeToApply->getExtType(ResNo)); + return TI.EnforceInteger(NodeToApply->getExtType(ResNo)); case SDTCisFP: // Require it to be one of the legal fp VTs. return TI.EnforceFloatingPoint(NodeToApply->getExtType(ResNo)); @@ -1645,7 +1637,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, case SDTCisSameAs: { unsigned OResNo = 0; TreePatternNode *OtherNode = - getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo); + getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo); return (int)NodeToApply->UpdateNodeType(ResNo, OtherNode->getExtType(OResNo), TP) | (int)OtherNode->UpdateNodeType(OResNo, @@ -1654,10 +1646,10 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, case SDTCisVTSmallerThanOp: { // The NodeToApply must be a leaf node that is a VT. OtherOperandNum must // have an integer type that is smaller than the VT. 
- if (!NodeToApply->isLeaf() || - !isa(NodeToApply->getLeafValue()) || - !cast(NodeToApply->getLeafValue())->getDef() - ->isSubClassOf("ValueType")) { + if (!NodeToApply->isLeaf() || !isa(NodeToApply->getLeafValue()) || + !cast(NodeToApply->getLeafValue()) + ->getDef() + ->isSubClassOf("ValueType")) { TP.error(N->getOperator()->getName() + " expects a VT operand!"); return false; } @@ -1667,26 +1659,23 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, TypeSetByHwMode TypeListTmp(VVT); unsigned OResNo = 0; - TreePatternNode *OtherNode = - getOperandNum(x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, - OResNo); + TreePatternNode *OtherNode = getOperandNum( + x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, OResNo); return TI.EnforceSmallerThan(TypeListTmp, OtherNode->getExtType(OResNo), /*SmallIsVT*/ true); } case SDTCisOpSmallerThanOp: { unsigned BResNo = 0; - TreePatternNode *BigOperand = - getOperandNum(x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo, - BResNo); + TreePatternNode *BigOperand = getOperandNum( + x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo, BResNo); return TI.EnforceSmallerThan(NodeToApply->getExtType(ResNo), BigOperand->getExtType(BResNo)); } case SDTCisEltOfVec: { unsigned VResNo = 0; - TreePatternNode *VecOperand = - getOperandNum(x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, - VResNo); + TreePatternNode *VecOperand = getOperandNum( + x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of VecOperand that don't have the right element // type. return TI.EnforceVectorEltTypeIs(VecOperand->getExtType(VResNo), @@ -1694,9 +1683,8 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, } case SDTCisSubVecOfVec: { unsigned VResNo = 0; - TreePatternNode *BigVecOperand = - getOperandNum(x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo, - VResNo); + TreePatternNode *BigVecOperand = getOperandNum( + x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of BigVecOperand that don't have the // right subvector type. @@ -1708,17 +1696,15 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, } case SDTCisSameNumEltsAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = - getOperandNum(x.SDTCisSameNumEltsAs_Info.OtherOperandNum, - N, NodeInfo, OResNo); + TreePatternNode *OtherNode = getOperandNum( + x.SDTCisSameNumEltsAs_Info.OtherOperandNum, N, NodeInfo, OResNo); return TI.EnforceSameNumElts(OtherNode->getExtType(OResNo), NodeToApply->getExtType(ResNo)); } case SDTCisSameSizeAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = - getOperandNum(x.SDTCisSameSizeAs_Info.OtherOperandNum, - N, NodeInfo, OResNo); + TreePatternNode *OtherNode = getOperandNum( + x.SDTCisSameSizeAs_Info.OtherOperandNum, N, NodeInfo, OResNo); return TI.EnforceSameSize(OtherNode->getExtType(OResNo), NodeToApply->getExtType(ResNo)); } @@ -1729,8 +1715,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, // Update the node type to match an instruction operand or result as specified // in the ins or outs lists on the instruction definition. Return true if the // type was actually changed. -bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo, - Record *Operand, +bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo, Record *Operand, TreePattern &TP) { // The 'unknown' operand indicates that types should be inferred from the // context. 
@@ -1808,7 +1793,7 @@ bool TreePatternNode::setDefaultMode(unsigned Mode) { // SDNodeInfo implementation // SDNodeInfo::SDNodeInfo(Record *R, const CodeGenHwModes &CGH) : Def(R) { - EnumName = R->getValueAsString("Opcode"); + EnumName = R->getValueAsString("Opcode"); SDClassName = R->getValueAsString("SDClass"); Record *TypeProfile = R->getValueAsDef("TypeProfile"); NumResults = TypeProfile->getValueAsInt("NumResults"); @@ -1818,8 +1803,8 @@ SDNodeInfo::SDNodeInfo(Record *R, const CodeGenHwModes &CGH) : Def(R) { Properties = parseSDPatternOperatorProperties(R); // Parse the type constraints. - std::vector ConstraintList = - TypeProfile->getValueAsListOfDefs("Constraints"); + std::vector ConstraintList = + TypeProfile->getValueAsListOfDefs("Constraints"); for (Record *R : ConstraintList) TypeConstraints.emplace_back(R, CGH); } @@ -1835,11 +1820,12 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const { for (const SDTypeConstraint &Constraint : TypeConstraints) { // Make sure that this applies to the correct node result. - if (Constraint.OperandNo >= NumResults) // FIXME: need value # + if (Constraint.OperandNo >= NumResults) // FIXME: need value # continue; switch (Constraint.ConstraintType) { - default: break; + default: + break; case SDTypeConstraint::SDTCisVT: if (Constraint.VVT.isSimple()) return Constraint.VVT.getSimple().SimpleTy; @@ -1856,9 +1842,8 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const { // static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) { - if (Operator->getName() == "set" || - Operator->getName() == "implicit") - return 0; // All return nothing. + if (Operator->getName() == "set" || Operator->getName() == "implicit") + return 0; // All return nothing. if (Operator->isSubClassOf("Intrinsic")) return CDP.getIntrinsic(Operator).IS.RetTys.size(); @@ -1908,16 +1893,17 @@ static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) { } // Add on one implicit def if it has a resolvable type. - if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) !=MVT::Other) + if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) != + MVT::Other) ++NumDefsToAdd; return NumDefsToAdd; } if (Operator->isSubClassOf("SDNodeXForm")) - return 1; // FIXME: Generalize SDNodeXForm + return 1; // FIXME: Generalize SDNodeXForm if (Operator->isSubClassOf("ValueType")) - return 1; // A type-cast of one result. + return 1; // A type-cast of one result. if (Operator->isSubClassOf("ComplexPattern")) return 1; @@ -1963,9 +1949,7 @@ void TreePatternNode::print(raw_ostream &OS) const { for (const ScopedName &Name : NamesAsPredicateArg) OS << ":$pred:" << Name.getScope() << ":" << Name.getIdentifier(); } -void TreePatternNode::dump() const { - print(errs()); -} +void TreePatternNode::dump() const { print(errs()); } /// isIsomorphicTo - Return true if this node is recursively /// isomorphic to the specified node. For this comparison, the node's @@ -1976,7 +1960,8 @@ void TreePatternNode::dump() const { /// isomorphic if the names match. bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N, const MultipleUseVarSet &DepVars) const { - if (N == this) return true; + if (N == this) + return true; if (N->isLeaf() != isLeaf()) return false; @@ -2035,17 +2020,18 @@ TreePatternNodePtr TreePatternNode::clone() const { void TreePatternNode::RemoveAllTypes() { // Reset to unknown type. 
std::fill(Types.begin(), Types.end(), TypeSetByHwMode()); - if (isLeaf()) return; + if (isLeaf()) + return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) getChild(i)->RemoveAllTypes(); } - /// SubstituteFormalArguments - Replace the formal arguments in this tree /// with actual values specified by ArgMap. void TreePatternNode::SubstituteFormalArguments( std::map &ArgMap) { - if (isLeaf()) return; + if (isLeaf()) + return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) { TreePatternNode *Child = getChild(i); @@ -2053,8 +2039,9 @@ void TreePatternNode::SubstituteFormalArguments( Init *Val = Child->getLeafValue(); // Note that, when substituting into an output pattern, Val might be an // UnsetInit. - if (isa(Val) || (isa(Val) && - cast(Val)->getDef()->getName() == "node")) { + if (isa(Val) || + (isa(Val) && + cast(Val)->getDef()->getName() == "node")) { // We found a use of a formal argument, replace it with its value. TreePatternNodePtr NewChild = ArgMap[Child->getName()]; assert(NewChild && "Couldn't find formal argument!"); @@ -2069,7 +2056,6 @@ void TreePatternNode::SubstituteFormalArguments( } } - /// InlinePatternFragments - If this pattern refers to any pattern /// fragments, return the set of inlined versions (this can be more than /// one if a PatFrags record has multiple alternatives). @@ -2205,7 +2191,7 @@ void TreePatternNode::InlinePatternFragments( for (const TreePredicateCall &Pred : getPredicateCalls()) FragTree->addPredicateCall(Pred); - // The fragment we inlined could have recursive inlining that is needed. See + // The fragment we inlined could have recursive inlining that is needed. See // if there are any pattern fragments in it and inline them as needed. FragTree->InlinePatternFragments(TP, OutAlternatives); } @@ -2224,8 +2210,7 @@ void TreePatternNode::InlinePatternFragments( /// GPR:$src operand above. /// static TypeSetByHwMode getImplicitType(Record *R, unsigned ResNo, - bool NotRegisters, - bool Unnamed, + bool NotRegisters, bool Unnamed, TreePattern &TP) { CodeGenDAGPatterns &CDP = TP.getDAGPatterns(); @@ -2331,11 +2316,10 @@ static TypeSetByHwMode getImplicitType(Record *R, unsigned ResNo, return TypeSetByHwMode(MVT::Other); } - /// getIntrinsicInfo - If this node corresponds to an intrinsic, return the /// CodeGenIntrinsic information for it, otherwise return a null pointer. -const CodeGenIntrinsic *TreePatternNode:: -getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const { +const CodeGenIntrinsic * +TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const { if (getOperator() != CDP.get_intrinsic_void_sdnode() && getOperator() != CDP.get_intrinsic_w_chain_sdnode() && getOperator() != CDP.get_intrinsic_wo_chain_sdnode()) @@ -2406,9 +2390,6 @@ bool TreePatternNode::NodeHasProperty(SDNP Property, return CGP.getSDNodeInfo(getOperator()).hasProperty(Property); } - - - /// TreeHasProperty - Return true if any node in this tree has the specified /// property. bool TreePatternNode::TreeHasProperty(SDNP Property, @@ -2423,8 +2404,8 @@ bool TreePatternNode::TreeHasProperty(SDNP Property, /// isCommutativeIntrinsic - Return true if the node corresponds to a /// commutative intrinsic. 
-bool -TreePatternNode::isCommutativeIntrinsic(const CodeGenDAGPatterns &CDP) const { +bool TreePatternNode::isCommutativeIntrinsic( + const CodeGenDAGPatterns &CDP) const { if (const CodeGenIntrinsic *Int = getIntrinsicInfo(CDP)) return Int->isCommutative; return false; @@ -2441,19 +2422,16 @@ static bool isOperandClass(const TreePatternNode *N, StringRef Class) { return false; } -static void emitTooManyOperandsError(TreePattern &TP, - StringRef InstName, - unsigned Expected, - unsigned Actual) { +static void emitTooManyOperandsError(TreePattern &TP, StringRef InstName, + unsigned Expected, unsigned Actual) { TP.error("Instruction '" + InstName + "' was provided " + Twine(Actual) + " operands but expected only " + Twine(Expected) + "!"); } -static void emitTooFewOperandsError(TreePattern &TP, - StringRef InstName, +static void emitTooFewOperandsError(TreePattern &TP, StringRef InstName, unsigned Actual) { - TP.error("Instruction '" + InstName + - "' expects more than the provided " + Twine(Actual) + " operands!"); + TP.error("Instruction '" + InstName + "' expects more than the provided " + + Twine(Actual) + " operands!"); } /// ApplyTypeConstraints - Apply all of the type constraints relevant to @@ -2469,9 +2447,9 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // If it's a regclass or something else known, include the type. bool MadeChange = false; for (unsigned i = 0, e = Types.size(); i != e; ++i) - MadeChange |= UpdateNodeType(i, getImplicitType(DI->getDef(), i, - NotRegisters, - !hasName(), TP), TP); + MadeChange |= UpdateNodeType( + i, getImplicitType(DI->getDef(), i, NotRegisters, !hasName(), TP), + TP); return MadeChange; } @@ -2530,8 +2508,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // Apply type info to the intrinsic ID. MadeChange |= getChild(0)->UpdateNodeType(0, MVT::iPTR, TP); - for (unsigned i = 0, e = getNumChildren()-1; i != e; ++i) { - MadeChange |= getChild(i+1)->ApplyTypeConstraints(TP, NotRegisters); + for (unsigned i = 0, e = getNumChildren() - 1; i != e; ++i) { + MadeChange |= getChild(i + 1)->ApplyTypeConstraints(TP, NotRegisters); MVT::SimpleValueType OpVT = getValueType(Int->IS.ParamTys[i]->getValueAsDef("VT")); @@ -2562,14 +2540,14 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { if (getOperator()->isSubClassOf("Instruction")) { const DAGInstruction &Inst = CDP.getInstruction(getOperator()); CodeGenInstruction &InstInfo = - CDP.getTargetInfo().getInstruction(getOperator()); + CDP.getTargetInfo().getInstruction(getOperator()); bool MadeChange = false; // Apply the result types to the node, these come from the things in the // (outs) list of the instruction. - unsigned NumResultsToAdd = std::min(InstInfo.Operands.NumDefs, - Inst.getNumResults()); + unsigned NumResultsToAdd = + std::min(InstInfo.Operands.NumDefs, Inst.getNumResults()); for (unsigned ResNo = 0; ResNo != NumResultsToAdd; ++ResNo) MadeChange |= UpdateNodeTypeFromInst(ResNo, Inst.getResult(ResNo), TP); @@ -2581,7 +2559,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // FIXME: Generalize to multiple possible types and multiple possible // ImplicitDefs. 
MVT::SimpleValueType VT = - InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()); + InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()); if (VT != MVT::Other) MadeChange |= UpdateNodeType(ResNo, VT, TP); @@ -2636,7 +2614,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // the pattern was intended to override A or skip it. unsigned NonOverridableOperands = NumFixedOperands; while (NonOverridableOperands > NumResults && - CDP.operandHasDefault(InstInfo.Operands[NonOverridableOperands-1].Rec)) + CDP.operandHasDefault( + InstInfo.Operands[NonOverridableOperands - 1].Rec)) --NonOverridableOperands; unsigned ChildNo = 0; @@ -2659,7 +2638,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } TreePatternNode *Child = getChild(ChildNo++); - unsigned ChildResNo = 0; // Instructions always use res #0 of their op. + unsigned ChildResNo = 0; // Instructions always use res #0 of their op. // If the operand has sub-operands, they may be provided by distinct // child patterns, so attempt to match each sub-operand separately. @@ -2672,8 +2651,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { if (Child->getNumMIResults(CDP) < NumArgs) { // Match first sub-operand against the child we already have. Record *SubRec = cast(MIOpInfo->getArg(0))->getDef(); - MadeChange |= - Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); + MadeChange |= Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); // And the remaining sub-operands against subsequent children. for (unsigned Arg = 1; Arg < NumArgs; ++Arg) { @@ -2686,7 +2664,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { SubRec = cast(MIOpInfo->getArg(Arg))->getDef(); MadeChange |= - Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); + Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP); } continue; } @@ -2699,8 +2677,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } if (!InstInfo.Operands.isVariadic && ChildNo != getNumChildren()) { - emitTooManyOperandsError(TP, getOperator()->getName(), - ChildNo, getNumChildren()); + emitTooManyOperandsError(TP, getOperator()->getName(), ChildNo, + getNumChildren()); return false; } @@ -2759,7 +2737,6 @@ static bool OnlyOnRHSOfCommutative(TreePatternNode *N) { return false; } - /// canPatternMatch - If it is impossible for this pattern to match on this /// target, fill in Reason and return false. Otherwise, return true. This is /// used as a sanity check for .td files (to prevent people from writing stuff @@ -2767,7 +2744,8 @@ static bool OnlyOnRHSOfCommutative(TreePatternNode *N) { /// generating stuff that is useless. bool TreePatternNode::canPatternMatch(std::string &Reason, const CodeGenDAGPatterns &CDP) { - if (isLeaf()) return true; + if (isLeaf()) + return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) if (!getChild(i)->canPatternMatch(Reason, CDP)) @@ -2790,11 +2768,12 @@ bool TreePatternNode::canPatternMatch(std::string &Reason, if (NodeInfo.hasProperty(SDNPCommutative) || isCommIntrinsic) { // Scan all of the operands of the node and make sure that only the last one // is a constant node, unless the RHS also is. - if (!OnlyOnRHSOfCommutative(getChild(getNumChildren()-1))) { + if (!OnlyOnRHSOfCommutative(getChild(getNumChildren() - 1))) { unsigned Skip = isCommIntrinsic ? 1 : 0; // First operand is intrinsic id. 
- for (unsigned i = Skip, e = getNumChildren()-1; i != e; ++i) + for (unsigned i = Skip, e = getNumChildren() - 1; i != e; ++i) if (OnlyOnRHSOfCommutative(getChild(i))) { - Reason="Immediate value must be on the RHS of commutative operators!"; + Reason = + "Immediate value must be on the RHS of commutative operators!"; return false; } } @@ -2808,17 +2787,17 @@ bool TreePatternNode::canPatternMatch(std::string &Reason, // TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, - CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), - isInputPattern(isInput), HasError(false), - Infer(*this) { + CodeGenDAGPatterns &cdp) + : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), + Infer(*this) { for (Init *I : RawPat->getValues()) Trees.push_back(ParseTreePattern(I, "")); } TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput, - CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), - isInputPattern(isInput), HasError(false), - Infer(*this) { + CodeGenDAGPatterns &cdp) + : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), + Infer(*this) { Trees.push_back(ParseTreePattern(Pat, "")); } @@ -2861,9 +2840,9 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, /// (foo GPR, imm) -> (foo GPR, (imm)) if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrags")) return ParseTreePattern( - DagInit::get(DI, nullptr, - std::vector >()), - OpName); + DagInit::get(DI, nullptr, + std::vector>()), + OpName); // Input argument? TreePatternNodePtr Res = makeIntrusiveRefCnt(DI, 1); @@ -2943,8 +2922,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, !Operator->isSubClassOf("SDNodeXForm") && !Operator->isSubClassOf("Intrinsic") && !Operator->isSubClassOf("ComplexPattern") && - Operator->getName() != "set" && - Operator->getName() != "implicit") + Operator->getName() != "set" && Operator->getName() != "implicit") error("Unrecognized node '" + Operator->getName() + "'!"); // Check to see if this is something that is illegal in an input pattern. @@ -2956,20 +2934,16 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, if (Operator->isSubClassOf("Intrinsic")) error("Cannot use '" + Operator->getName() + "' in an output pattern!"); - if (Operator->isSubClassOf("SDNode") && - Operator->getName() != "imm" && - Operator->getName() != "timm" && - Operator->getName() != "fpimm" && + if (Operator->isSubClassOf("SDNode") && Operator->getName() != "imm" && + Operator->getName() != "timm" && Operator->getName() != "fpimm" && Operator->getName() != "tglobaltlsaddr" && Operator->getName() != "tconstpool" && Operator->getName() != "tjumptable" && Operator->getName() != "tframeindex" && Operator->getName() != "texternalsym" && Operator->getName() != "tblockaddress" && - Operator->getName() != "tglobaladdr" && - Operator->getName() != "bb" && - Operator->getName() != "vt" && - Operator->getName() != "mcsym") + Operator->getName() != "tglobaladdr" && Operator->getName() != "bb" && + Operator->getName() != "vt" && Operator->getName() != "mcsym") error("Cannot use '" + Operator->getName() + "' in an output pattern!"); } @@ -2979,8 +2953,8 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) Children.push_back(ParseTreePattern(Dag->getArg(i), Dag->getArgNameStr(i))); - // Get the actual number of results before Operator is converted to an intrinsic - // node (which is hard-coded to have either zero or one result). 
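This check enforces that, on a commutative node, an immediate may appear only as the last operand unless the last operand is itself an immediate, since variant generation canonicalizes constants to the RHS. Distilled into a standalone predicate over a simplified operand-kind list:

#include <cassert>
#include <vector>

enum OpKind { Reg, Imm };

static bool immOnlyOnRHS(const std::vector<OpKind> &Ops) {
  if (Ops.empty() || Ops.back() == Imm)
    return true; // last operand already an immediate: nothing to enforce
  for (size_t i = 0; i + 1 < Ops.size(); ++i)
    if (Ops[i] == Imm)
      return false; // immediate must be on the RHS of commutative operators
  return true;
}

int main() {
  assert(immOnlyOnRHS({Reg, Imm}));  // (add GPR:$x, 7)  -- accepted
  assert(!immOnlyOnRHS({Imm, Reg})); // (add 7, GPR:$x)  -- rejected
}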
+ // Get the actual number of results before Operator is converted to an + // intrinsic node (which is hard-coded to have either zero or one result). unsigned NumResults = GetNumNodeResults(Operator, CDP); // If the operator is an intrinsic, then this is just syntactic sugar for @@ -2988,7 +2962,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, // convert the intrinsic name to a number. if (Operator->isSubClassOf("Intrinsic")) { const CodeGenIntrinsic &Int = getDAGPatterns().getIntrinsic(Operator); - unsigned IID = getDAGPatterns().getIntrinsicID(Operator)+1; + unsigned IID = getDAGPatterns().getIntrinsicID(Operator) + 1; // If this intrinsic returns void, it must have side-effects and thus a // chain. @@ -3072,13 +3046,11 @@ static bool SimplifyTree(TreePatternNodePtr &N) { return MadeChange; } - - /// InferAllTypes - Infer/propagate as many types throughout the expression /// patterns as possible. Return true if all types are inferred, false /// otherwise. Flags an error if a type contradiction is found. -bool TreePattern:: -InferAllTypes(const StringMap > *InNamedTypes) { +bool TreePattern::InferAllTypes( + const StringMap> *InNamedTypes) { if (NamedNodes.empty()) ComputeNamedNodes(); @@ -3092,7 +3064,7 @@ InferAllTypes(const StringMap > *InNamedTypes) { // If there are constraints on our named nodes, apply them. for (auto &Entry : NamedNodes) { - SmallVectorImpl &Nodes = Entry.second; + SmallVectorImpl &Nodes = Entry.second; // If we have input named node types, propagate their types to the named // values here. @@ -3103,8 +3075,8 @@ InferAllTypes(const StringMap > *InNamedTypes) { return true; } - const SmallVectorImpl &InNodes = - InNamedTypes->find(Entry.getKey())->second; + const SmallVectorImpl &InNodes = + InNamedTypes->find(Entry.getKey())->second; // The input types should be fully resolved by now. for (TreePatternNode *Node : Nodes) { @@ -3120,19 +3092,18 @@ InferAllTypes(const StringMap > *InNamedTypes) { continue; } - assert(Node->getNumTypes() == 1 && - InNodes[0]->getNumTypes() == 1 && + assert(Node->getNumTypes() == 1 && InNodes[0]->getNumTypes() == 1 && "FIXME: cannot name multiple result nodes yet"); - MadeChange |= Node->UpdateNodeType(0, InNodes[0]->getExtType(0), - *this); + MadeChange |= + Node->UpdateNodeType(0, InNodes[0]->getExtType(0), *this); } } // If there are multiple nodes with the same name, they must all have the // same type. if (Entry.second.size() > 1) { - for (unsigned i = 0, e = Nodes.size()-1; i != e; ++i) { - TreePatternNode *N1 = Nodes[i], *N2 = Nodes[i+1]; + for (unsigned i = 0, e = Nodes.size() - 1; i != e; ++i) { + TreePatternNode *N1 = Nodes[i], *N2 = Nodes[i + 1]; assert(N1->getNumTypes() == 1 && N2->getNumTypes() == 1 && "FIXME: cannot name multiple result nodes yet"); @@ -3190,7 +3161,7 @@ CodeGenDAGPatterns::CodeGenDAGPatterns(RecordKeeper &R, ParsePatternFragments(); ParseDefaultOperands(); ParseInstructions(); - ParsePatternFragments(/*OutFrags*/true); + ParsePatternFragments(/*OutFrags*/ true); ParsePatterns(); // Generate variants. For example, commutative patterns can match @@ -3221,7 +3192,7 @@ Record *CodeGenDAGPatterns::getSDNodeNamed(StringRef Name) const { // Parse all of the SDNode definitions for the target, populating SDNodes. 
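A note on the getIntrinsicID(Operator) + 1 above: intrinsic IDs are exposed off by one because slot 0 is reserved to mean "not an intrinsic", and getIntrinsicInfo undoes the shift with IID - 1. The convention in miniature (IntrinsicTable is an invented toy, not the CodeGenIntrinsicTable API):

#include <cassert>
#include <string>
#include <vector>

struct IntrinsicTable {
  std::vector<std::string> Names;
  // Table index i is published as ID i + 1, leaving 0 free.
  unsigned idFor(unsigned TableIndex) const { return TableIndex + 1; }
  const std::string &infoFor(unsigned ID) const {
    assert(ID != 0 && ID - 1 < Names.size() && "bad intrinsic ID");
    return Names[ID - 1];
  }
};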
void CodeGenDAGPatterns::ParseNodeInfo() { - std::vector Nodes = Records.getAllDerivedDefinitions("SDNode"); + std::vector Nodes = Records.getAllDerivedDefinitions("SDNode"); const CodeGenHwModes &CGH = getTargetInfo().getHwModes(); while (!Nodes.empty()) { @@ -3231,15 +3202,16 @@ void CodeGenDAGPatterns::ParseNodeInfo() { } // Get the builtin intrinsic nodes. - intrinsic_void_sdnode = getSDNodeNamed("intrinsic_void"); - intrinsic_w_chain_sdnode = getSDNodeNamed("intrinsic_w_chain"); + intrinsic_void_sdnode = getSDNodeNamed("intrinsic_void"); + intrinsic_w_chain_sdnode = getSDNodeNamed("intrinsic_w_chain"); intrinsic_wo_chain_sdnode = getSDNodeNamed("intrinsic_wo_chain"); } /// ParseNodeTransforms - Parse all SDNodeXForm instances into the SDNodeXForms /// map, and emit them to the file as functions. void CodeGenDAGPatterns::ParseNodeTransforms() { - std::vector Xforms = Records.getAllDerivedDefinitions("SDNodeXForm"); + std::vector Xforms = + Records.getAllDerivedDefinitions("SDNodeXForm"); while (!Xforms.empty()) { Record *XFormNode = Xforms.back(); Record *SDNode = XFormNode->getValueAsDef("Opcode"); @@ -3252,21 +3224,22 @@ void CodeGenDAGPatterns::ParseNodeTransforms() { } void CodeGenDAGPatterns::ParseComplexPatterns() { - std::vector AMs = Records.getAllDerivedDefinitions("ComplexPattern"); + std::vector AMs = + Records.getAllDerivedDefinitions("ComplexPattern"); while (!AMs.empty()) { ComplexPatterns.insert(std::make_pair(AMs.back(), AMs.back())); AMs.pop_back(); } } - /// ParsePatternFragments - Parse all of the PatFrag definitions in the .td /// file, building up the PatternFragments map. After we've collected them all, /// inline fragments together as necessary, so that there are no references left /// inside a pattern fragment to a pattern fragment. /// void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { - std::vector Fragments = Records.getAllDerivedDefinitions("PatFrags"); + std::vector Fragments = + Records.getAllDerivedDefinitions("PatFrags"); // First step, parse all of the fragments. for (Record *Frag : Fragments) { @@ -3274,10 +3247,9 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { continue; ListInit *LI = Frag->getValueAsListInit("Fragments"); - TreePattern *P = - (PatternFragments[Frag] = std::make_unique( - Frag, LI, !Frag->isSubClassOf("OutPatFrag"), - *this)).get(); + TreePattern *P = (PatternFragments[Frag] = std::make_unique( + Frag, LI, !Frag->isSubClassOf("OutPatFrag"), *this)) + .get(); // Validate the argument list, converting it to set, to discard duplicates. std::vector &Args = P->getArgList(); @@ -3294,10 +3266,9 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { DefInit *OpsOp = dyn_cast(OpsList->getOperator()); // Special cases: ops == outs == ins. Different names are used to // improve readability. - if (!OpsOp || - (OpsOp->getDef()->getName() != "ops" && - OpsOp->getDef()->getName() != "outs" && - OpsOp->getDef()->getName() != "ins")) + if (!OpsOp || (OpsOp->getDef()->getName() != "ops" && + OpsOp->getDef()->getName() != "outs" && + OpsOp->getDef()->getName() != "ins")) P->error("Operands list should start with '(ops ... '!"); // Copy over the arguments. @@ -3323,7 +3294,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { // If there is a node transformation corresponding to this, keep track of // it. Record *Transform = Frag->getValueAsDef("OperandTransform"); - if (!getSDNodeTransform(Transform).second.empty()) // not noop xform? 
+ if (!getSDNodeTransform(Transform).second.empty()) // not noop xform? for (const auto &T : P->getTrees()) T->setTransformFn(Transform); } @@ -3354,7 +3325,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { } void CodeGenDAGPatterns::ParseDefaultOperands() { - std::vector DefaultOps; + std::vector DefaultOps; DefaultOps = Records.getAllDerivedDefinitions("OperandWithDefaultOps"); // Find some SDNode. @@ -3366,10 +3337,10 @@ void CodeGenDAGPatterns::ParseDefaultOperands() { // Clone the DefaultInfo dag node, changing the operator from 'ops' to // SomeSDnode so that we can parse this. - std::vector > Ops; + std::vector> Ops; for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op) - Ops.push_back(std::make_pair(DefaultInfo->getArg(op), - DefaultInfo->getArgName(op))); + Ops.push_back( + std::make_pair(DefaultInfo->getArg(op), DefaultInfo->getArgName(op))); DagInit *DI = DagInit::get(SomeSDNode, nullptr, Ops); // Create a TreePattern to parse this. @@ -3520,7 +3491,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( I.error("Cannot specify a transform function on a set node!"); // Check the set destinations. - unsigned NumDests = Pat->getNumChildren()-1; + unsigned NumDests = Pat->getNumChildren() - 1; for (unsigned i = 0; i != NumDests; ++i) { TreePatternNodePtr Dest = Pat->getChildShared(i); // For set destinations we also must resolve fragments here. @@ -3565,6 +3536,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( class InstAnalyzer { const CodeGenDAGPatterns &CDP; + public: bool hasSideEffects; bool mayStore; @@ -3574,8 +3546,8 @@ public: bool hasChain; InstAnalyzer(const CodeGenDAGPatterns &cdp) - : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false), - isBitcast(false), isVariadic(false), hasChain(false) {} + : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false), + isBitcast(false), isVariadic(false), hasChain(false) {} void Analyze(const PatternToMatch &Pat) { const TreePatternNode *N = Pat.getSrcPattern(); @@ -3611,9 +3583,12 @@ public: // Handle ComplexPattern leaves. if (LeafRec->isSubClassOf("ComplexPattern")) { const ComplexPattern &CP = CDP.getComplexPattern(LeafRec); - if (CP.hasProperty(SDNPMayStore)) mayStore = true; - if (CP.hasProperty(SDNPMayLoad)) mayLoad = true; - if (CP.hasProperty(SDNPSideEffect)) hasSideEffects = true; + if (CP.hasProperty(SDNPMayStore)) + mayStore = true; + if (CP.hasProperty(SDNPMayLoad)) + mayLoad = true; + if (CP.hasProperty(SDNPSideEffect)) + hasSideEffects = true; } } return; @@ -3624,11 +3599,16 @@ public: AnalyzeNode(N->getChild(i)); // Notice properties of the node. 
- if (N->NodeHasProperty(SDNPMayStore, CDP)) mayStore = true; - if (N->NodeHasProperty(SDNPMayLoad, CDP)) mayLoad = true; - if (N->NodeHasProperty(SDNPSideEffect, CDP)) hasSideEffects = true; - if (N->NodeHasProperty(SDNPVariadic, CDP)) isVariadic = true; - if (N->NodeHasProperty(SDNPHasChain, CDP)) hasChain = true; + if (N->NodeHasProperty(SDNPMayStore, CDP)) + mayStore = true; + if (N->NodeHasProperty(SDNPMayLoad, CDP)) + mayLoad = true; + if (N->NodeHasProperty(SDNPSideEffect, CDP)) + hasSideEffects = true; + if (N->NodeHasProperty(SDNPVariadic, CDP)) + isVariadic = true; + if (N->NodeHasProperty(SDNPHasChain, CDP)) + hasChain = true; if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) { ModRefInfo MR = IntInfo->ME.getModRef(); @@ -3645,17 +3625,15 @@ public: hasSideEffects = true; } } - }; static bool InferFromPattern(CodeGenInstruction &InstInfo, - const InstAnalyzer &PatInfo, - Record *PatDef) { + const InstAnalyzer &PatInfo, Record *PatDef) { bool Error = false; // Remember where InstInfo got its flags. if (InstInfo.hasUndefFlags()) - InstInfo.InferredFrom = PatDef; + InstInfo.InferredFrom = PatDef; // Check explicitly set flags for consistency. if (InstInfo.hasSideEffects != PatInfo.hasSideEffects && @@ -3666,14 +3644,14 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo, if (!InstInfo.hasSideEffects) { Error = true; PrintError(PatDef->getLoc(), "Pattern doesn't match hasSideEffects = " + - Twine(InstInfo.hasSideEffects)); + Twine(InstInfo.hasSideEffects)); } } if (InstInfo.mayStore != PatInfo.mayStore && !InstInfo.mayStore_Unset) { Error = true; - PrintError(PatDef->getLoc(), "Pattern doesn't match mayStore = " + - Twine(InstInfo.mayStore)); + PrintError(PatDef->getLoc(), + "Pattern doesn't match mayStore = " + Twine(InstInfo.mayStore)); } if (InstInfo.mayLoad != PatInfo.mayLoad && !InstInfo.mayLoad_Unset) { @@ -3681,8 +3659,8 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo, // Some targets translate immediates to loads. if (!InstInfo.mayLoad) { Error = true; - PrintError(PatDef->getLoc(), "Pattern doesn't match mayLoad = " + - Twine(InstInfo.mayLoad)); + PrintError(PatDef->getLoc(), + "Pattern doesn't match mayLoad = " + Twine(InstInfo.mayLoad)); } } @@ -3712,11 +3690,13 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo, /// null_frag operator. static bool hasNullFragReference(DagInit *DI) { DefInit *OpDef = dyn_cast(DI->getOperator()); - if (!OpDef) return false; + if (!OpDef) + return false; Record *Operator = OpDef->getDef(); // If this is the null fragment, return true. - if (Operator->getName() == "null_frag") return true; + if (Operator->getName() == "null_frag") + return true; // If any of the arguments reference the null fragment, return true. for (unsigned i = 0, e = DI->getNumArgs(); i != e; ++i) { if (auto Arg = dyn_cast(DI->getArg(i))) @@ -3743,8 +3723,8 @@ static bool hasNullFragReference(ListInit *LI) { } /// Get all the instructions in a tree. -static void -getInstructionsInTree(TreePatternNode *Tree, SmallVectorImpl &Instrs) { +static void getInstructionsInTree(TreePatternNode *Tree, + SmallVectorImpl &Instrs) { if (Tree->isLeaf()) return; if (Tree->getOperator()->isSubClassOf("Instruction")) @@ -3755,8 +3735,7 @@ getInstructionsInTree(TreePatternNode *Tree, SmallVectorImpl &Instrs) { /// Check the class of a pattern leaf node against the instruction operand it /// represents. 
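For context: InstAnalyzer derives instruction flags as a plain OR over every node of the source pattern, as the hunk above shows for mayStore/mayLoad/hasSideEffects. The same accumulation over a toy tree, with the flags packed into a bitmask (ToyNode is illustrative only):

#include <vector>

struct ToyNode {
  unsigned Flags = 0; // e.g. 1 = mayLoad, 2 = mayStore, 4 = sideEffects
  std::vector<ToyNode> Children;
};

// OR together the flags of a node and everything beneath it.
static unsigned collectFlags(const ToyNode &N) {
  unsigned F = N.Flags;
  for (const ToyNode &C : N.Children)
    F |= collectFlags(C);
  return F;
}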
-static bool checkOperandClass(CGIOperandList::OperandInfo &OI, - Record *Leaf) { +static bool checkOperandClass(CGIOperandList::OperandInfo &OI, Record *Leaf) { if (OI.Rec == Leaf) return true; @@ -3772,8 +3751,9 @@ static bool checkOperandClass(CGIOperandList::OperandInfo &OI, return false; } -void CodeGenDAGPatterns::parseInstructionPattern( - CodeGenInstruction &CGI, ListInit *Pat, DAGInstMap &DAGInsts) { +void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI, + ListInit *Pat, + DAGInstMap &DAGInsts) { assert(!DAGInsts.count(CGI.TheDef) && "Instruction already parsed!"); @@ -3789,7 +3769,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( MapVector> InstResults; - std::vector InstImpResults; + std::vector InstImpResults; // Verify that the top-level forms in the instruction are of void type, and // fill in the InstResults map. @@ -3805,8 +3785,8 @@ void CodeGenDAGPatterns::parseInstructionPattern( Pat->getExtType(k).writeToStream(OS); } I.error("Top-level forms in instruction pattern should have" - " void types, has types " + - OS.str()); + " void types, has types " + + OS.str()); } // Find inputs and outputs, and verify the structure of the uses/defs. @@ -3823,7 +3803,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( assert(I.getArgList().empty() && "Args list should still be empty here!"); // Check that all of the results occur first in the list. - std::vector Results; + std::vector Results; std::vector ResultIndices; SmallVector ResNodes; for (unsigned i = 0; i != NumResults; ++i) { @@ -3850,8 +3830,9 @@ void CodeGenDAGPatterns::parseInstructionPattern( Record *R = cast(RNode->getLeafValue())->getDef(); ResNodes.push_back(std::move(RNode)); if (!R) - I.error("Operand $" + OpName + " should be a set destination: all " - "outputs must occur before inputs in operand list!"); + I.error("Operand $" + OpName + + " should be a set destination: all " + "outputs must occur before inputs in operand list!"); if (!checkOperandClass(CGI.Operands[i], R)) I.error("Operand $" + OpName + " class mismatch!"); @@ -3868,7 +3849,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( // Loop over the inputs next. std::vector ResultNodeOperands; - std::vector Operands; + std::vector Operands; for (unsigned i = NumResults, e = CGI.Operands.size(); i != e; ++i) { CGIOperandList::OperandInfo &Op = CGI.Operands[i]; const std::string &OpName = Op.Name; @@ -3885,16 +3866,17 @@ void CodeGenDAGPatterns::parseInstructionPattern( continue; } I.error("Operand $" + OpName + - " does not appear in the instruction pattern"); + " does not appear in the instruction pattern"); } TreePatternNodePtr InVal = InstInputs[OpName]; - InstInputs.erase(OpName); // It occurred, remove from map. + InstInputs.erase(OpName); // It occurred, remove from map. 
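On the error message below ("all outputs must occur before inputs in operand list"): the parser requires the instruction's outputs to form a prefix of the operand list. That ordering rule in isolation (IsOutput is an invented stand-in for "operand i is a def"):

#include <vector>

// Outputs must all precede inputs; an output after any input is an
// error in parseInstructionPattern.
static bool outputsFormPrefix(const std::vector<bool> &IsOutput) {
  bool SeenInput = false;
  for (bool Out : IsOutput) {
    if (!Out)
      SeenInput = true;
    else if (SeenInput)
      return false; // rejected with I.error in the real code
  }
  return true;
}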
if (InVal->isLeaf() && isa(InVal->getLeafValue())) { Record *InRec = cast(InVal->getLeafValue())->getDef(); if (!checkOperandClass(Op, InRec)) - I.error("Operand $" + OpName + "'s register class disagrees" - " between the operand and pattern"); + I.error("Operand $" + OpName + + "'s register class disagrees" + " between the operand and pattern"); } Operands.push_back(Op.Rec); @@ -3910,7 +3892,7 @@ void CodeGenDAGPatterns::parseInstructionPattern( std::vector Children; Children.push_back(OpNode); OpNode = makeIntrusiveRefCnt(Xform, std::move(Children), - OpNode->getNumTypes()); + OpNode->getNumTypes()); } ResultNodeOperands.push_back(std::move(OpNode)); @@ -3935,8 +3917,8 @@ void CodeGenDAGPatterns::parseInstructionPattern( TreePatternNodePtr Pattern = I.getTree(0); TreePatternNodePtr SrcPattern; if (Pattern->getOperator()->getName() == "set") { - SrcPattern = Pattern->getChild(Pattern->getNumChildren()-1)->clone(); - } else{ + SrcPattern = Pattern->getChild(Pattern->getNumChildren() - 1)->clone(); + } else { // Not a set (store or something?) SrcPattern = Pattern; } @@ -3954,7 +3936,8 @@ void CodeGenDAGPatterns::parseInstructionPattern( /// any fragments involved. This populates the Instructions list with fully /// resolved instructions. void CodeGenDAGPatterns::ParseInstructions() { - std::vector Instrs = Records.getAllDerivedDefinitions("Instruction"); + std::vector Instrs = + Records.getAllDerivedDefinitions("Instruction"); for (Record *Instr : Instrs) { ListInit *LI = nullptr; @@ -3969,8 +3952,8 @@ void CodeGenDAGPatterns::ParseInstructions() { // is from a multiclass expansion w/ a SDPatternOperator passed in as // null_frag. if (!LI || LI->empty() || hasNullFragReference(LI)) { - std::vector Results; - std::vector Operands; + std::vector Results; + std::vector Operands; CodeGenInstruction &InstInfo = Target.getInstruction(Instr); @@ -3980,14 +3963,15 @@ void CodeGenDAGPatterns::ParseInstructions() { // The rest are inputs. for (unsigned j = InstInfo.Operands.NumDefs, - e = InstInfo.Operands.size(); j < e; ++j) + e = InstInfo.Operands.size(); + j < e; ++j) Operands.push_back(InstInfo.Operands[j].Rec); } // Create and insert the instruction. Instructions.try_emplace(Instr, std::move(Results), std::move(Operands), std::vector()); - continue; // no pattern. + continue; // no pattern. } CodeGenInstruction &CGI = Target.getInstruction(Instr); @@ -4036,19 +4020,18 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, std::string Reason; if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this)) { PrintWarning(Pattern->getRecord()->getLoc(), - Twine("Pattern can never match: ") + Reason); + Twine("Pattern can never match: ") + Reason); return; } // If the source pattern's root is a complex pattern, that complex pattern // must specify the nodes it can potentially match. if (const ComplexPattern *CP = - PTM.getSrcPattern()->getComplexPatternInfo(*this)) + PTM.getSrcPattern()->getComplexPatternInfo(*this)) if (CP->getRootNodes().empty()) Pattern->error("ComplexPattern at root must specify list of opcodes it" " could match"); - // Find all of the named values in the input and output, ensure they have the // same type. 
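Context for the makeIntrusiveRefCnt(Xform, ...) call above: an operand that carries an SDNodeXForm is wrapped in a fresh single-child node whose operator is the transform. The same wrapping over a minimal ref-counted toy tree (MiniNode is illustrative, and shared_ptr stands in for the intrusive ref-counting of TreePatternNodePtr):

#include <memory>
#include <utility>
#include <vector>

struct MiniNode {
  const char *Op = nullptr;
  std::vector<std::shared_ptr<MiniNode>> Kids;
};

// Promote a transform to an explicit node with the operand as its
// only child, mirroring the ResultNodeOperands logic above.
static std::shared_ptr<MiniNode> wrapInXForm(const char *Xform,
                                             std::shared_ptr<MiniNode> N) {
  auto Wrapper = std::make_shared<MiniNode>();
  Wrapper->Op = Xform;
  Wrapper->Kids.push_back(std::move(N));
  return Wrapper;
}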
std::map SrcNames, DstNames; @@ -4074,8 +4057,8 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, } void CodeGenDAGPatterns::InferInstructionFlags() { - ArrayRef Instructions = - Target.getInstructionsByEnumValue(); + ArrayRef Instructions = + Target.getInstructionsByEnumValue(); unsigned Errors = 0; @@ -4085,7 +4068,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { for (const PatternToMatch &PTM : ptms()) { // We can only infer from single-instruction patterns, otherwise we won't // know which instruction should get the flags. - SmallVector PatInstrs; + SmallVector PatInstrs; getInstructionsInTree(PTM.getDstPattern(), PatInstrs); if (PatInstrs.size() != 1) continue; @@ -4109,7 +4092,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { if (Target.guessInstructionProperties()) { for (unsigned i = 0, e = Instructions.size(); i != e; ++i) { CodeGenInstruction *InstInfo = - const_cast(Instructions[i]); + const_cast(Instructions[i]); if (InstInfo->InferredFrom) continue; // The mayLoad and mayStore flags default to false. @@ -4123,7 +4106,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { // Complain about any flags that are still undefined. for (unsigned i = 0, e = Instructions.size(); i != e; ++i) { CodeGenInstruction *InstInfo = - const_cast(Instructions[i]); + const_cast(Instructions[i]); if (InstInfo->InferredFrom) continue; if (InstInfo->hasSideEffects_Unset) @@ -4138,12 +4121,11 @@ void CodeGenDAGPatterns::InferInstructionFlags() { } } - /// Verify instruction flags against pattern node properties. void CodeGenDAGPatterns::VerifyInstructionFlags() { unsigned Errors = 0; for (const PatternToMatch &PTM : ptms()) { - SmallVector Instrs; + SmallVector Instrs; getInstructionsInTree(PTM.getDstPattern(), Instrs); if (Instrs.empty()) continue; @@ -4186,16 +4168,16 @@ void CodeGenDAGPatterns::VerifyInstructionFlags() { ++Errors; for (const std::string &Msg : Msgs) - PrintError(PTM.getSrcRecord()->getLoc(), Twine(Msg) + " on the " + - (Instrs.size() == 1 ? - "instruction" : "output instructions")); + PrintError( + PTM.getSrcRecord()->getLoc(), + Twine(Msg) + " on the " + + (Instrs.size() == 1 ? "instruction" : "output instructions")); // Provide the location of the relevant instruction definitions. for (const Record *Instr : Instrs) { if (Instr != PTM.getSrcRecord()) PrintError(Instr->getLoc(), "defined here"); const CodeGenInstruction &InstInfo = Target.getInstruction(Instr); - if (InstInfo.InferredFrom && - InstInfo.InferredFrom != InstInfo.TheDef && + if (InstInfo.InferredFrom && InstInfo.InferredFrom != InstInfo.TheDef && InstInfo.InferredFrom != PTM.getSrcRecord()) PrintError(InstInfo.InferredFrom->getLoc(), "inferred from pattern"); } @@ -4237,11 +4219,11 @@ static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) { // Promote xform function to be an explicit node wherever set. 
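A sketch of what VerifyInstructionFlags, earlier in this hunk, does when it batches messages: compare the declared instruction bits against the pattern-derived ones and collect every mismatch before reporting. Reduced to two plain flag sets (Flags is invented for illustration, and the *_Unset escape hatches the real code honors are ignored here):

#include <string>
#include <vector>

struct Flags {
  bool mayLoad = false, mayStore = false, hasSideEffects = false;
};

static std::vector<std::string> diffFlags(const Flags &Decl,
                                          const Flags &FromPat) {
  std::vector<std::string> Msgs;
  if (Decl.mayLoad != FromPat.mayLoad)
    Msgs.push_back("pattern disagrees on mayLoad");
  if (Decl.mayStore != FromPat.mayStore)
    Msgs.push_back("pattern disagrees on mayStore");
  if (Decl.hasSideEffects != FromPat.hasSideEffects)
    Msgs.push_back("pattern disagrees on hasSideEffects");
  return Msgs;
}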
static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) { if (Record *Xform = N->getTransformFn()) { - N->setTransformFn(nullptr); - std::vector Children; - Children.push_back(PromoteXForms(N)); - return makeIntrusiveRefCnt(Xform, std::move(Children), - N->getNumTypes()); + N->setTransformFn(nullptr); + std::vector Children; + Children.push_back(PromoteXForms(N)); + return makeIntrusiveRefCnt(Xform, std::move(Children), + N->getNumTypes()); } if (!N->isLeaf()) @@ -4252,9 +4234,9 @@ static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) { return N; } -void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, - TreePattern &Pattern, TreePattern &Result, - const std::vector &InstImpResults) { +void CodeGenDAGPatterns::ParseOnePattern( + Record *TheDef, TreePattern &Pattern, TreePattern &Result, + const std::vector &InstImpResults) { // Inline pattern fragments and expand multiple alternatives. Pattern.InlinePatternFragments(); @@ -4274,8 +4256,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, // Infer as many types as possible. If we cannot infer all of them, we // can never do anything with this pattern: report it to the user. - InferredAllResultTypes = - Result.InferAllTypes(&Pattern.getNamedNodesMap()); + InferredAllResultTypes = Result.InferAllTypes(&Pattern.getNamedNodesMap()); IterateInference = false; @@ -4286,11 +4267,11 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, for (const auto &T : Pattern.getTrees()) for (unsigned i = 0, e = std::min(Result.getOnlyTree()->getNumTypes(), T->getNumTypes()); - i != e; ++i) { - IterateInference |= T->UpdateNodeType( - i, Result.getOnlyTree()->getExtType(i), Result); - IterateInference |= Result.getOnlyTree()->UpdateNodeType( - i, T->getExtType(i), Result); + i != e; ++i) { + IterateInference |= + T->UpdateNodeType(i, Result.getOnlyTree()->getExtType(i), Result); + IterateInference |= + Result.getOnlyTree()->UpdateNodeType(i, T->getExtType(i), Result); } // If our iteration has converged and the input pattern's types are fully @@ -4302,8 +4283,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, // // In any case, to handle this, we just go through and disambiguate some // arbitrary types to the result pattern's nodes. - if (!IterateInference && InferredAllPatternTypes && - !InferredAllResultTypes) + if (!IterateInference && InferredAllPatternTypes && !InferredAllResultTypes) IterateInference = ForceArbitraryInstResultType(Result.getTree(0).get(), Result); } while (IterateInference); @@ -4357,7 +4337,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef, } void CodeGenDAGPatterns::ParsePatterns() { - std::vector Patterns = Records.getAllDerivedDefinitions("Pattern"); + std::vector Patterns = Records.getAllDerivedDefinitions("Pattern"); for (Record *CurPattern : Patterns) { DagInit *Tree = CurPattern->getValueAsDag("PatternToMatch"); @@ -4369,7 +4349,8 @@ void CodeGenDAGPatterns::ParsePatterns() { TreePattern Pattern(CurPattern, Tree, true, *this); ListInit *LI = CurPattern->getValueAsListInit("ResultInstrs"); - if (LI->empty()) continue; // no pattern. + if (LI->empty()) + continue; // no pattern. // Parse the instruction. 
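The do/while loop above runs type inference to a fixed point: keep re-applying constraints while any node's type set still narrows. The control shape, with the whole pass abstracted into a callable (a sketch of the loop skeleton only, not the real signatures; ParseOnePattern also re-seeds inference between passes):

#include <functional>

static void inferToFixpoint(const std::function<bool()> &RunOnePass) {
  bool Changed;
  do {
    Changed = RunOnePass(); // true if any node type was narrowed
  } while (Changed);
}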
TreePattern Result(CurPattern, LI, false, *this); @@ -4382,7 +4363,7 @@ void CodeGenDAGPatterns::ParsePatterns() { std::map InstInputs; MapVector> InstResults; - std::vector InstImpResults; + std::vector InstImpResults; for (unsigned j = 0, ee = Pattern.getNumTrees(); j != ee; ++j) FindPatternInputsAndOutputs(Pattern, Pattern.getTree(j), InstInputs, InstResults, InstImpResults); @@ -4512,7 +4493,6 @@ static void DumpDepVars(MultipleUseVarSet &DepVars) { } #endif - /// CombineChildVariants - Given a bunch of permutations of each child of the /// 'operator' node, put them together in all possible ways. static void CombineChildVariants( @@ -4598,7 +4578,8 @@ static void CombineChildVariants(TreePatternNodePtr Orig, static void GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N, std::vector &Children) { - assert(N->getNumChildren()==2 &&"Associative but doesn't have 2 children!"); + assert(N->getNumChildren() == 2 && + "Associative but doesn't have 2 children!"); Record *Operator = N->getOperator(); // Only permit raw nodes. @@ -4725,7 +4706,6 @@ static void GenerateVariantsOf(TreePatternNodePtr N, } } - // GenerateVariants - Generate variants. For example, commutative patterns can // match multiple ways. Add them to PatternsToMatch as well. void CodeGenDAGPatterns::GenerateVariants() { @@ -4782,7 +4762,8 @@ void CodeGenDAGPatterns::GenerateVariants() { } } // If we already have it, ignore the variant. - if (AlreadyExists) continue; + if (AlreadyExists) + continue; // Otherwise, add it to the list of patterns we have. PatternsToMatch.emplace_back( diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h index 2611fe0..ea6219c 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h @@ -56,17 +56,15 @@ struct MachineValueTypeSet { static_assert(std::is_same, uint8_t>::value, "Change uint8_t here to the SimpleValueType's type"); - static unsigned constexpr Capacity = std::numeric_limits::max()+1; + static unsigned constexpr Capacity = std::numeric_limits::max() + 1; using WordType = uint64_t; - static unsigned constexpr WordWidth = CHAR_BIT*sizeof(WordType); - static unsigned constexpr NumWords = Capacity/WordWidth; - static_assert(NumWords*WordWidth == Capacity, + static unsigned constexpr WordWidth = CHAR_BIT * sizeof(WordType); + static unsigned constexpr NumWords = Capacity / WordWidth; + static_assert(NumWords * WordWidth == Capacity, "Capacity should be a multiple of WordWidth"); LLVM_ATTRIBUTE_ALWAYS_INLINE - MachineValueTypeSet() { - clear(); - } + MachineValueTypeSet() { clear(); } LLVM_ATTRIBUTE_ALWAYS_INLINE unsigned size() const { @@ -76,9 +74,7 @@ struct MachineValueTypeSet { return Count; } LLVM_ATTRIBUTE_ALWAYS_INLINE - void clear() { - std::memset(Words.data(), 0, NumWords*sizeof(WordType)); - } + void clear() { std::memset(Words.data(), 0, NumWords * sizeof(WordType)); } LLVM_ATTRIBUTE_ALWAYS_INLINE bool empty() const { for (WordType W : Words) @@ -90,7 +86,7 @@ struct MachineValueTypeSet { unsigned count(MVT T) const { return (Words[T.SimpleTy / WordWidth] >> (T.SimpleTy % WordWidth)) & 1; } - std::pair insert(MVT T) { + std::pair insert(MVT T) { bool V = count(T.SimpleTy); Words[T.SimpleTy / WordWidth] |= WordType(1) << (T.SimpleTy % WordWidth); return {*this, V}; @@ -113,8 +109,8 @@ struct MachineValueTypeSet { using iterator_category = std::forward_iterator_tag; using value_type = MVT; using difference_type = ptrdiff_t; - using pointer = const MVT*; - using reference = const MVT&; + using 
pointer = const MVT *; + using reference = const MVT &; LLVM_ATTRIBUTE_ALWAYS_INLINE MVT operator*() const { @@ -128,7 +124,7 @@ struct MachineValueTypeSet { LLVM_ATTRIBUTE_ALWAYS_INLINE const_iterator &operator++() { assert(Pos != Capacity); - Pos = find_from_pos(Pos+1); + Pos = find_from_pos(Pos + 1); return *this; } @@ -137,9 +133,7 @@ struct MachineValueTypeSet { return Set == It.Set && Pos == It.Pos; } LLVM_ATTRIBUTE_ALWAYS_INLINE - bool operator!=(const const_iterator &It) const { - return !operator==(It); - } + bool operator!=(const const_iterator &It) const { return !operator==(It); } private: unsigned find_from_pos(unsigned P) const { @@ -151,7 +145,7 @@ struct MachineValueTypeSet { // the trailing bits need to be masked off to use findFirstSet. if (SkipBits != 0) { WordType W = Set->Words[SkipWords]; - W &= maskLeadingOnes(WordWidth-SkipBits); + W &= maskLeadingOnes(WordWidth - SkipBits); if (W != 0) return Count + llvm::countr_zero(W); Count += WordWidth; @@ -174,20 +168,18 @@ struct MachineValueTypeSet { LLVM_ATTRIBUTE_ALWAYS_INLINE const_iterator begin() const { return const_iterator(this, false); } LLVM_ATTRIBUTE_ALWAYS_INLINE - const_iterator end() const { return const_iterator(this, true); } + const_iterator end() const { return const_iterator(this, true); } LLVM_ATTRIBUTE_ALWAYS_INLINE bool operator==(const MachineValueTypeSet &S) const { return Words == S.Words; } LLVM_ATTRIBUTE_ALWAYS_INLINE - bool operator!=(const MachineValueTypeSet &S) const { - return !operator==(S); - } + bool operator!=(const MachineValueTypeSet &S) const { return !operator==(S); } private: friend struct const_iterator; - std::array Words; + std::array Words; }; raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T); @@ -200,14 +192,12 @@ struct TypeSetByHwMode : public InfoByHwMode { TypeSetByHwMode(const TypeSetByHwMode &VTS) = default; TypeSetByHwMode &operator=(const TypeSetByHwMode &) = default; TypeSetByHwMode(MVT::SimpleValueType VT) - : TypeSetByHwMode(ValueTypeByHwMode(VT)) {} + : TypeSetByHwMode(ValueTypeByHwMode(VT)) {} TypeSetByHwMode(ValueTypeByHwMode VT) - : TypeSetByHwMode(ArrayRef(&VT, 1)) {} + : TypeSetByHwMode(ArrayRef(&VT, 1)) {} TypeSetByHwMode(ArrayRef VTList); - SetType &getOrCreate(unsigned Mode) { - return Map[Mode]; - } + SetType &getOrCreate(unsigned Mode) { return Map[Mode]; } bool isValueTypeByHwMode(bool AllowEmpty) const; ValueTypeByHwMode getValueTypeByHwMode() const; @@ -225,9 +215,7 @@ struct TypeSetByHwMode : public InfoByHwMode { bool isPossible() const; - bool isPointer() const { - return getValueTypeByHwMode().isPointer(); - } + bool isPointer() const { return getValueTypeByHwMode().isPointer(); } unsigned getPtrAddrSpace() const { assert(isPointer()); @@ -313,8 +301,7 @@ struct TypeInfer { /// Ensure that for each type T in \p Sub, T is a vector type, and there /// exists a type U in \p Vec such that U is a vector type with the same /// element type as T and at least as many elements as T. - bool EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec, - TypeSetByHwMode &Sub); + bool EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec, TypeSetByHwMode &Sub); /// 1. Ensure that \p V has a scalar type iff \p W has a scalar type. /// 2. Ensure that for each vector type T in \p V, there exists a vector /// type U in \p W, such that T and U have the same number of elements. 
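For readers unfamiliar with MachineValueTypeSet, the struct being reformatted above: it is a fixed 256-bit set indexed by SimpleValueType, and both count()/insert() and the iterator's find_from_pos reduce to word/bit arithmetic. A compact restatement over a bare std::array (TinyVTSet is a toy; __builtin_ctzll stands in for llvm::countr_zero and assumes GCC/Clang):

#include <array>
#include <cstdint>

struct TinyVTSet {
  // 256 bits = 4 x 64-bit words, as in MachineValueTypeSet.
  std::array<uint64_t, 4> Words{};

  bool count(unsigned T) const { return (Words[T / 64] >> (T % 64)) & 1; }

  bool insert(unsigned T) { // returns whether T was already present
    bool Was = count(T);
    Words[T / 64] |= uint64_t(1) << (T % 64);
    return Was;
  }

  // find_from_pos in miniature: mask off the bits below P in the first
  // word, then scan whole words for the next set bit.
  unsigned nextSetBit(unsigned P) const {
    for (std::size_t W = P / 64; W != Words.size(); ++W) {
      uint64_t Bits = Words[W];
      if (W == P / 64)
        Bits &= ~uint64_t(0) << (P % 64);
      if (Bits)
        return unsigned(W * 64 + __builtin_ctzll(Bits));
    }
    return 256; // Capacity, i.e. "end"
  }
};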
@@ -346,15 +333,13 @@ struct TypeInfer { SuppressValidation(TypeInfer &TI) : Infer(TI), SavedValidate(TI.Validate) { Infer.Validate = false; } - ~SuppressValidation() { - Infer.Validate = SavedValidate; - } + ~SuppressValidation() { Infer.Validate = SavedValidate; } TypeInfer &Infer; bool SavedValidate; }; TreePattern &TP; - bool Validate = true; // Indicate whether to validate types. + bool Validate = true; // Indicate whether to validate types. private: const TypeSetByHwMode &getLegalTypes() const; @@ -372,14 +357,24 @@ typedef StringSet<> MultipleUseVarSet; struct SDTypeConstraint { SDTypeConstraint(Record *R, const CodeGenHwModes &CGH); - unsigned OperandNo; // The operand # this constraint applies to. + unsigned OperandNo; // The operand # this constraint applies to. enum { - SDTCisVT, SDTCisPtrTy, SDTCisInt, SDTCisFP, SDTCisVec, SDTCisSameAs, - SDTCisVTSmallerThanOp, SDTCisOpSmallerThanOp, SDTCisEltOfVec, - SDTCisSubVecOfVec, SDTCVecEltisVT, SDTCisSameNumEltsAs, SDTCisSameSizeAs + SDTCisVT, + SDTCisPtrTy, + SDTCisInt, + SDTCisFP, + SDTCisVec, + SDTCisSameAs, + SDTCisVTSmallerThanOp, + SDTCisOpSmallerThanOp, + SDTCisEltOfVec, + SDTCisSubVecOfVec, + SDTCVecEltisVT, + SDTCisSameNumEltsAs, + SDTCisSameSizeAs } ConstraintType; - union { // The discriminated union. + union { // The discriminated union. struct { unsigned OtherOperandNum; } SDTCisSameAs_Info; @@ -422,6 +417,7 @@ struct SDTypeConstraint { class ScopedName { unsigned Scope; std::string Identifier; + public: ScopedName(unsigned Scope, StringRef Identifier) : Scope(Scope), Identifier(std::string(Identifier)) { @@ -447,6 +443,7 @@ class SDNodeInfo { unsigned NumResults; int NumOperands; std::vector TypeConstraints; + public: // Parse the specified record. SDNodeInfo(Record *R, const CodeGenHwModes &CGH); @@ -487,11 +484,11 @@ class TreePredicateFn { /// PatFragRec - This is the TreePattern for the PatFrag that we /// originally came from. TreePattern *PatFragRec; + public: /// TreePredicateFn constructor. Here 'N' is a subclass of PatFrag. TreePredicateFn(TreePattern *N); - TreePattern *getOrigPatFragRecord() const { return PatFragRec; } /// isAlwaysTrue - Return true if this is a noop predicate. @@ -582,7 +579,8 @@ public: bool isAtomicOrderingWeakerThanRelease() const; /// If non-null, indicates that this predicate is a predefined memory VT - /// predicate for a load/store and returns the ValueType record for the memory VT. + /// predicate for a load/store and returns the ValueType record for the memory + /// VT. 
Record *getMemoryVT() const; /// If non-null, indicates that this predicate is a predefined memory VT /// predicate (checking only the scalar type) for load/store and returns the @@ -615,14 +613,12 @@ struct TreePredicateCall { unsigned Scope; TreePredicateCall(const TreePredicateFn &Fn, unsigned Scope) - : Fn(Fn), Scope(Scope) {} + : Fn(Fn), Scope(Scope) {} bool operator==(const TreePredicateCall &o) const { return Fn == o.Fn && Scope == o.Scope; } - bool operator!=(const TreePredicateCall &o) const { - return !(*this == o); - } + bool operator!=(const TreePredicateCall &o) const { return !(*this == o); } }; class TreePatternNode : public RefCountedBase { @@ -681,7 +677,7 @@ public: const std::vector &getNamesAsPredicateArg() const { return NamesAsPredicateArg; } - void setNamesAsPredicateArg(const std::vector& Names) { + void setNamesAsPredicateArg(const std::vector &Names) { NamesAsPredicateArg = Names; } void addNameAsPredicateArg(const ScopedName &N) { @@ -733,9 +729,7 @@ public: const TreePatternNodePtr &getChildShared(unsigned N) const { return Children[N]; } - TreePatternNodePtr &getChildSharedPtr(unsigned N) { - return Children[N]; - } + TreePatternNodePtr &getChildSharedPtr(unsigned N) { return Children[N]; } void setChild(unsigned i, TreePatternNodePtr N) { Children[i] = N; } /// hasChild - Return true if N is any of our children. @@ -762,7 +756,8 @@ public: } void addPredicateCall(const TreePredicateCall &Call) { assert(!Call.Fn.isAlwaysTrue() && "Empty predicate string!"); - assert(!is_contained(PredicateCalls, Call) && "predicate applied recursively"); + assert(!is_contained(PredicateCalls, Call) && + "predicate applied recursively"); PredicateCalls.push_back(Call); } void addPredicateCall(const TreePredicateFn &Fn, unsigned Scope) { @@ -805,8 +800,7 @@ public: void print(raw_ostream &OS) const; void dump() const; -public: // Higher level manipulation routines. - +public: // Higher level manipulation routines. /// clone - Return a new copy of this tree. /// TreePatternNodePtr clone() const; @@ -845,8 +839,7 @@ public: // Higher level manipulation routines. TreePattern &TP); bool UpdateNodeType(unsigned ResNo, MVT::SimpleValueType InTy, TreePattern &TP); - bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy, - TreePattern &TP); + bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy, TreePattern &TP); // Update node type with types inferred from an instruction operand or result // def from the ins/outs lists. @@ -910,7 +903,6 @@ class TreePattern { TypeInfer Infer; public: - /// TreePattern constructor - Parse the specified DagInits into the /// current record. TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, @@ -971,12 +963,8 @@ public: /// error - If this is the first error in the current resolution step, /// print it and set the error flag. Otherwise, continue silently. 
void error(const Twine &Msg); - bool hasError() const { - return HasError; - } - void resetError() { - HasError = false; - } + bool hasError() const { return HasError; } + void resetError() { HasError = false; } TypeInfer &getInfer() { return Infer; } @@ -989,7 +977,6 @@ private: void ComputeNamedNodes(TreePatternNode *N); }; - inline bool TreePatternNode::UpdateNodeType(unsigned ResNo, const TypeSetByHwMode &InTy, TreePattern &TP) { @@ -1014,7 +1001,6 @@ inline bool TreePatternNode::UpdateNodeType(unsigned ResNo, return TP.getInfer().MergeInTypeInfo(Types[ResNo], VTS); } - /// DAGDefaultOperand - One of these is created for each OperandWithDefaultOps /// that has a set ExecuteAlways / DefaultOps field. struct DAGDefaultOperand { @@ -1022,9 +1008,9 @@ struct DAGDefaultOperand { }; class DAGInstruction { - std::vector Results; - std::vector Operands; - std::vector ImpResults; + std::vector Results; + std::vector Operands; + std::vector ImpResults; TreePatternNodePtr SrcPattern; TreePatternNodePtr ResultPattern; @@ -1041,7 +1027,7 @@ public: unsigned getNumResults() const { return Results.size(); } unsigned getNumOperands() const { return Operands.size(); } unsigned getNumImpResults() const { return ImpResults.size(); } - const std::vector& getImpResults() const { return ImpResults; } + const std::vector &getImpResults() const { return ImpResults; } Record *getResult(unsigned RN) const { assert(RN < Results.size()); @@ -1065,34 +1051,33 @@ public: /// PatternToMatch - Used by CodeGenDAGPatterns to keep tab of patterns /// processed to produce isel. class PatternToMatch { - Record *SrcRecord; // Originating Record for the pattern. - ListInit *Predicates; // Top level predicate conditions to match. - TreePatternNodePtr SrcPattern; // Source pattern to match. - TreePatternNodePtr DstPattern; // Resulting pattern. - std::vector Dstregs; // Physical register defs being matched. - std::string HwModeFeatures; - int AddedComplexity; // Add to matching pattern complexity. - unsigned ID; // Unique ID for the record. + Record *SrcRecord; // Originating Record for the pattern. + ListInit *Predicates; // Top level predicate conditions to match. + TreePatternNodePtr SrcPattern; // Source pattern to match. + TreePatternNodePtr DstPattern; // Resulting pattern. + std::vector Dstregs; // Physical register defs being matched. + std::string HwModeFeatures; + int AddedComplexity; // Add to matching pattern complexity. + unsigned ID; // Unique ID for the record. 
public: PatternToMatch(Record *srcrecord, ListInit *preds, TreePatternNodePtr src, TreePatternNodePtr dst, std::vector dstregs, - int complexity, unsigned uid, - const Twine &hwmodefeatures = "") + int complexity, unsigned uid, const Twine &hwmodefeatures = "") : SrcRecord(srcrecord), Predicates(preds), SrcPattern(src), DstPattern(dst), Dstregs(std::move(dstregs)), HwModeFeatures(hwmodefeatures.str()), AddedComplexity(complexity), ID(uid) {} - Record *getSrcRecord() const { return SrcRecord; } - ListInit *getPredicates() const { return Predicates; } + Record *getSrcRecord() const { return SrcRecord; } + ListInit *getPredicates() const { return Predicates; } TreePatternNode *getSrcPattern() const { return SrcPattern.get(); } TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; } TreePatternNode *getDstPattern() const { return DstPattern.get(); } TreePatternNodePtr getDstPatternShared() const { return DstPattern; } - const std::vector &getDstRegs() const { return Dstregs; } - StringRef getHwModeFeatures() const { return HwModeFeatures; } - int getAddedComplexity() const { return AddedComplexity; } + const std::vector &getDstRegs() const { return Dstregs; } + StringRef getHwModeFeatures() const { return HwModeFeatures; } + int getAddedComplexity() const { return AddedComplexity; } unsigned getID() const { return ID; } std::string getPredicateCheck() const; @@ -1108,14 +1093,14 @@ class CodeGenDAGPatterns { CodeGenTarget Target; CodeGenIntrinsicTable Intrinsics; - std::map SDNodes; - std::map, LessRecordByID> + std::map SDNodes; + std::map, LessRecordByID> SDNodeXForms; - std::map ComplexPatterns; + std::map ComplexPatterns; std::map, LessRecordByID> PatternFragments; - std::map DefaultOperands; - std::map Instructions; + std::map DefaultOperands; + std::map Instructions; // Specific SDNode definitions: Record *intrinsic_void_sdnode; @@ -1128,7 +1113,7 @@ class CodeGenDAGPatterns { TypeSetByHwMode LegalVTS; - using PatternRewriterFn = std::function; + using PatternRewriterFn = std::function; PatternRewriterFn PatternRewriter; unsigned NumScopes = 0; @@ -1150,7 +1135,7 @@ public: } // Node transformation lookups. 
- typedef std::pair NodeXForm; + typedef std::pair NodeXForm; const NodeXForm &getSDNodeTransform(Record *R) const { auto F = SDNodeXForms.find(R); assert(F != SDNodeXForms.end() && "Invalid transform!"); @@ -1165,25 +1150,27 @@ public: const CodeGenIntrinsic &getIntrinsic(Record *R) const { for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i) - if (Intrinsics[i].TheDef == R) return Intrinsics[i]; + if (Intrinsics[i].TheDef == R) + return Intrinsics[i]; llvm_unreachable("Unknown intrinsic!"); } const CodeGenIntrinsic &getIntrinsicInfo(unsigned IID) const { - if (IID-1 < Intrinsics.size()) - return Intrinsics[IID-1]; + if (IID - 1 < Intrinsics.size()) + return Intrinsics[IID - 1]; llvm_unreachable("Bad intrinsic ID!"); } unsigned getIntrinsicID(Record *R) const { for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i) - if (Intrinsics[i].TheDef == R) return i; + if (Intrinsics[i].TheDef == R) + return i; llvm_unreachable("Unknown intrinsic!"); } const DAGDefaultOperand &getDefaultOperand(Record *R) const { auto F = DefaultOperands.find(R); - assert(F != DefaultOperands.end() &&"Isn't an analyzed default operand!"); + assert(F != DefaultOperands.end() && "Isn't an analyzed default operand!"); return F->second; } @@ -1213,10 +1200,9 @@ public: iterator_range ptms() const { return PatternsToMatch; } /// Parse the Pattern for an instruction, and insert the result in DAGInsts. - typedef std::map DAGInstMap; - void parseInstructionPattern( - CodeGenInstruction &CGI, ListInit *Pattern, - DAGInstMap &DAGInsts); + typedef std::map DAGInstMap; + void parseInstructionPattern(CodeGenInstruction &CGI, ListInit *Pattern, + DAGInstMap &DAGInsts); const DAGInstruction &getInstruction(Record *R) const { auto F = Instructions.find(R); @@ -1224,9 +1210,7 @@ public: return F->second; } - Record *get_intrinsic_void_sdnode() const { - return intrinsic_void_sdnode; - } + Record *get_intrinsic_void_sdnode() const { return intrinsic_void_sdnode; } Record *get_intrinsic_w_chain_sdnode() const { return intrinsic_w_chain_sdnode; } @@ -1238,7 +1222,7 @@ public: bool operandHasDefault(Record *Op) const { return Op->isSubClassOf("OperandWithDefaultOps") && - !getDefaultOperand(Op).DefaultOps.empty(); + !getDefaultOperand(Op).DefaultOps.empty(); } private: @@ -1254,8 +1238,8 @@ private: void GenerateVariants(); void VerifyInstructionFlags(); - void ParseOnePattern(Record *TheDef, - TreePattern &Pattern, TreePattern &Result, + void ParseOnePattern(Record *TheDef, TreePattern &Pattern, + TreePattern &Result, const std::vector &InstImpResults); void AddPatternToMatch(TreePattern *Pattern, PatternToMatch &&PTM); void FindPatternInputsAndOutputs( @@ -1266,14 +1250,13 @@ private: std::vector &InstImpResults); }; - inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode *N, TreePattern &TP) const { - bool MadeChange = false; - for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i) - MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP); - return MadeChange; - } + bool MadeChange = false; + for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i) + MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP); + return MadeChange; +} } // end namespace llvm diff --git a/llvm/utils/TableGen/CodeGenHwModes.cpp b/llvm/utils/TableGen/CodeGenHwModes.cpp index 2171507..7c08c75 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.cpp +++ b/llvm/utils/TableGen/CodeGenHwModes.cpp @@ -37,17 +37,17 @@ HwMode::HwMode(Record *R) { } LLVM_DUMP_METHOD -void HwMode::dump() const { - dbgs() << Name << ": " << 
Features << '\n'; -} +void HwMode::dump() const { dbgs() << Name << ": " << Features << '\n'; } HwModeSelect::HwModeSelect(Record *R, CodeGenHwModes &CGH) { - std::vector<Record*> Modes = R->getValueAsListOfDefs("Modes"); - std::vector<Record*> Objects = R->getValueAsListOfDefs("Objects"); + std::vector<Record *> Modes = R->getValueAsListOfDefs("Modes"); + std::vector<Record *> Objects = R->getValueAsListOfDefs("Objects"); if (Modes.size() != Objects.size()) { - PrintError(R->getLoc(), "in record " + R->getName() + - " derived from HwModeSelect: the lists Modes and Objects should " - "have the same size"); + PrintError( + R->getLoc(), + "in record " + R->getName() + + " derived from HwModeSelect: the lists Modes and Objects should " + "have the same size"); report_fatal_error("error in target description."); } for (unsigned i = 0, e = Modes.size(); i != e; ++i) { diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 09d20ad..9a5b7a8 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -22,46 +22,46 @@ // HwModeId -> list of predicates (definition) namespace llvm { - class Record; - class RecordKeeper; +class Record; +class RecordKeeper; - struct CodeGenHwModes; +struct CodeGenHwModes; - struct HwMode { - HwMode(Record *R); - StringRef Name; - std::string Features; - std::string Predicates; - void dump() const; - }; +struct HwMode { + HwMode(Record *R); + StringRef Name; + std::string Features; + std::string Predicates; + void dump() const; +}; - struct HwModeSelect { - HwModeSelect(Record *R, CodeGenHwModes &CGH); - typedef std::pair<unsigned,Record*> PairType; - std::vector<PairType> Items; - void dump() const; - }; +struct HwModeSelect { + HwModeSelect(Record *R, CodeGenHwModes &CGH); + typedef std::pair<unsigned, Record *> PairType; + std::vector<PairType> Items; + void dump() const; +}; - struct CodeGenHwModes { - enum : unsigned { DefaultMode = 0 }; - static StringRef DefaultModeName; +struct CodeGenHwModes { + enum : unsigned { DefaultMode = 0 }; + static StringRef DefaultModeName; - CodeGenHwModes(RecordKeeper &R); - unsigned getHwModeId(Record *R) const; - const HwMode &getMode(unsigned Id) const { - assert(Id != 0 && "Mode id of 0 is reserved for the default mode"); - return Modes[Id-1]; - } - const HwModeSelect &getHwModeSelect(Record *R) const; - unsigned getNumModeIds() const { return Modes.size()+1; } - void dump() const; + CodeGenHwModes(RecordKeeper &R); + unsigned getHwModeId(Record *R) const; + const HwMode &getMode(unsigned Id) const { + assert(Id != 0 && "Mode id of 0 is reserved for the default mode"); + return Modes[Id - 1]; + } + const HwModeSelect &getHwModeSelect(Record *R) const; + unsigned getNumModeIds() const { return Modes.size() + 1; } + void dump() const; - private: - RecordKeeper &Records; - DenseMap<Record*,unsigned> ModeIds; // HwMode Record -> HwModeId - std::vector<HwMode> Modes; - std::map<Record*,HwModeSelect> ModeSelects; - }; -} +private: + RecordKeeper &Records; + DenseMap<Record *, unsigned> ModeIds; // HwMode Record -> HwModeId + std::vector<HwMode> Modes; + std::map<Record *, HwModeSelect> ModeSelects; +}; +} // namespace llvm #endif // LLVM_UTILS_TABLEGEN_CODEGENHWMODES_H diff --git a/llvm/utils/TableGen/CodeGenInstruction.cpp b/llvm/utils/TableGen/CodeGenInstruction.cpp index 5cd8941..a569194 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/CodeGenInstruction.cpp @@ -55,15 +55,15 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) { unsigned e = InDI->getNumArgs() + OutDI->getNumArgs(); OperandList.reserve(e); bool VariadicOuts = false; - for (unsigned i = 0; i != e; ++i){ + for (unsigned i = 0; i != e; ++i) { Init
*ArgInit; StringRef ArgName; if (i < NumDefs) { ArgInit = OutDI->getArg(i); ArgName = OutDI->getArgNameStr(i); } else { - ArgInit = InDI->getArg(i-NumDefs); - ArgName = InDI->getArgNameStr(i-NumDefs); + ArgInit = InDI->getArg(i - NumDefs); + ArgName = InDI->getArgNameStr(i - NumDefs); } DagInit *SubArgDag = dyn_cast(ArgInit); @@ -192,7 +192,6 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) { --NumDefs; } - /// getOperandNamed - Return the index of the operand with the specified /// non-empty name. If the instruction does not have an operand with the /// specified name, abort. @@ -230,7 +229,7 @@ bool CGIOperandList::hasSubOperandAlias( return false; } -std::pair +std::pair CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { if (!Op.starts_with("$")) PrintFatalError(TheDef->getLoc(), @@ -242,7 +241,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { // Check to see if this is $foo.bar. StringRef::size_type DotIdx = OpName.find_first_of('.'); if (DotIdx != StringRef::npos) { - SubOpName = OpName.substr(DotIdx+1); + SubOpName = OpName.substr(DotIdx + 1); if (SubOpName.empty()) PrintFatalError(TheDef->getLoc(), TheDef->getName() + @@ -266,7 +265,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { OpIdx = getOperandNamed(OpName); - if (SubOpName.empty()) { // If no suboperand name was specified: + if (SubOpName.empty()) { // If no suboperand name was specified: // If one was needed, throw. if (OperandList[OpIdx].MINumOperands > 1 && !AllowWholeOp && SubOpName.empty()) @@ -299,82 +298,80 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { return std::make_pair(0U, 0U); } -static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, - Record *Rec) { +static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) { // EARLY_CLOBBER: @early $reg StringRef::size_type wpos = CStr.find_first_of(" \t"); StringRef::size_type start = CStr.find_first_not_of(" \t"); StringRef Tok = CStr.substr(start, wpos - start); if (Tok == "@earlyclobber") { - StringRef Name = CStr.substr(wpos+1); + StringRef Name = CStr.substr(wpos + 1); wpos = Name.find_first_not_of(" \t"); if (wpos == StringRef::npos) - PrintFatalError( - Rec->getLoc(), "Illegal format for @earlyclobber constraint in '" + - Rec->getName() + "': '" + CStr + "'"); + PrintFatalError(Rec->getLoc(), + "Illegal format for @earlyclobber constraint in '" + + Rec->getName() + "': '" + CStr + "'"); Name = Name.substr(wpos); - std::pair Op = Ops.ParseOperandName(Name, false); + std::pair Op = Ops.ParseOperandName(Name, false); // Build the string for the operand if (!Ops[Op.first].Constraints[Op.second].isNone()) - PrintFatalError( - Rec->getLoc(), "Operand '" + Name + "' of '" + Rec->getName() + - "' cannot have multiple constraints!"); + PrintFatalError(Rec->getLoc(), "Operand '" + Name + "' of '" + + Rec->getName() + + "' cannot have multiple constraints!"); Ops[Op.first].Constraints[Op.second] = - CGIOperandList::ConstraintInfo::getEarlyClobber(); + CGIOperandList::ConstraintInfo::getEarlyClobber(); return; } // Only other constraint is "TIED_TO" for now. 
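On ParseOperandName above: operand references arrive as "$foo" or "$foo.bar", and the first step is splitting off the optional sub-operand name at the first dot. That split by itself, over std::string rather than StringRef and without the error paths the real code takes for malformed names:

#include <string>
#include <utility>

// "$foo.bar" -> {"foo", "bar"}; "$foo" -> {"foo", ""}.
static std::pair<std::string, std::string> splitOpName(std::string Op) {
  if (!Op.empty() && Op[0] == '$')
    Op.erase(0, 1); // drop the leading '$'
  auto Dot = Op.find('.');
  if (Dot == std::string::npos)
    return {Op, ""};
  return {Op.substr(0, Dot), Op.substr(Dot + 1)};
}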
StringRef::size_type pos = CStr.find_first_of('='); if (pos == StringRef::npos) - PrintFatalError( - Rec->getLoc(), "Unrecognized constraint '" + CStr + - "' in '" + Rec->getName() + "'"); + PrintFatalError(Rec->getLoc(), "Unrecognized constraint '" + CStr + + "' in '" + Rec->getName() + "'"); start = CStr.find_first_not_of(" \t"); // TIED_TO: $src1 = $dst wpos = CStr.find_first_of(" \t", start); if (wpos == StringRef::npos || wpos > pos) - PrintFatalError( - Rec->getLoc(), "Illegal format for tied-to constraint in '" + - Rec->getName() + "': '" + CStr + "'"); + PrintFatalError(Rec->getLoc(), + "Illegal format for tied-to constraint in '" + + Rec->getName() + "': '" + CStr + "'"); StringRef LHSOpName = CStr.substr(start, wpos - start); - std::pair LHSOp = Ops.ParseOperandName(LHSOpName, false); + std::pair LHSOp = Ops.ParseOperandName(LHSOpName, false); wpos = CStr.find_first_not_of(" \t", pos + 1); if (wpos == StringRef::npos) - PrintFatalError( - Rec->getLoc(), "Illegal format for tied-to constraint: '" + CStr + "'"); + PrintFatalError(Rec->getLoc(), + "Illegal format for tied-to constraint: '" + CStr + "'"); StringRef RHSOpName = CStr.substr(wpos); - std::pair RHSOp = Ops.ParseOperandName(RHSOpName, false); + std::pair RHSOp = Ops.ParseOperandName(RHSOpName, false); // Sort the operands into order, which should put the output one // first. But keep the original order, for use in diagnostics. bool FirstIsDest = (LHSOp < RHSOp); - std::pair DestOp = (FirstIsDest ? LHSOp : RHSOp); + std::pair DestOp = (FirstIsDest ? LHSOp : RHSOp); StringRef DestOpName = (FirstIsDest ? LHSOpName : RHSOpName); - std::pair SrcOp = (FirstIsDest ? RHSOp : LHSOp); + std::pair SrcOp = (FirstIsDest ? RHSOp : LHSOp); StringRef SrcOpName = (FirstIsDest ? RHSOpName : LHSOpName); // Ensure one operand is a def and the other is a use. if (DestOp.first >= Ops.NumDefs) - PrintFatalError( - Rec->getLoc(), "Input operands '" + LHSOpName + "' and '" + RHSOpName + - "' of '" + Rec->getName() + "' cannot be tied!"); + PrintFatalError(Rec->getLoc(), "Input operands '" + LHSOpName + "' and '" + + RHSOpName + "' of '" + Rec->getName() + + "' cannot be tied!"); if (SrcOp.first < Ops.NumDefs) - PrintFatalError( - Rec->getLoc(), "Output operands '" + LHSOpName + "' and '" + RHSOpName + - "' of '" + Rec->getName() + "' cannot be tied!"); + PrintFatalError(Rec->getLoc(), "Output operands '" + LHSOpName + "' and '" + + RHSOpName + "' of '" + Rec->getName() + + "' cannot be tied!"); // The constraint has to go on the operand with higher index, i.e. // the source one. Check there isn't another constraint there // already. 
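The tied-to parsing above ("$src1 = $dst") sorts the two operands so the def comes first, then checks that one side is an output and the other an input. With flat operand indices (a simplification: the real code compares (operand, sub-operand) pairs, and outputs occupy indices [0, NumDefs)) the acceptance test is just:

#include <algorithm>

// A tie is legal only between one output and one input operand.
static bool validTie(unsigned LHS, unsigned RHS, unsigned NumDefs) {
  unsigned Dest = std::min(LHS, RHS); // candidate output
  unsigned Src = std::max(LHS, RHS);  // candidate input
  return Dest < NumDefs && Src >= NumDefs;
}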
if (!Ops[SrcOp.first].Constraints[SrcOp.second].isNone()) - PrintFatalError( - Rec->getLoc(), "Operand '" + SrcOpName + "' of '" + Rec->getName() + - "' cannot have multiple constraints!"); + PrintFatalError(Rec->getLoc(), "Operand '" + SrcOpName + "' of '" + + Rec->getName() + + "' cannot have multiple constraints!"); unsigned DestFlatOpNo = Ops.getFlattenedOperandNumber(DestOp); auto NewConstraint = CGIOperandList::ConstraintInfo::getTied(DestFlatOpNo); @@ -384,16 +381,17 @@ static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, for (const CGIOperandList::OperandInfo &Op : Ops) { for (unsigned i = 0; i < Op.MINumOperands; i++) if (Op.Constraints[i] == NewConstraint) - PrintFatalError( - Rec->getLoc(), "Operand '" + DestOpName + "' of '" + Rec->getName() + - "' cannot have multiple operands tied to it!"); + PrintFatalError(Rec->getLoc(), + "Operand '" + DestOpName + "' of '" + Rec->getName() + + "' cannot have multiple operands tied to it!"); } Ops[SrcOp.first].Constraints[SrcOp.second] = NewConstraint; } static void ParseConstraints(StringRef CStr, CGIOperandList &Ops, Record *Rec) { - if (CStr.empty()) return; + if (CStr.empty()) + return; StringRef delims(","); StringRef::size_type bidx, eidx; @@ -413,15 +411,15 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) { while (true) { StringRef OpName; std::tie(OpName, DisableEncoding) = getToken(DisableEncoding, " ,\t"); - if (OpName.empty()) break; + if (OpName.empty()) + break; // Figure out which operand this is. - std::pair Op = ParseOperandName(OpName, false); + std::pair Op = ParseOperandName(OpName, false); // Mark the operand as not-to-be encoded. OperandList[Op.first].DoNotEncode[Op.second] = true; } - } //===----------------------------------------------------------------------===// @@ -429,27 +427,27 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) { //===----------------------------------------------------------------------===// CodeGenInstruction::CodeGenInstruction(Record *R) - : TheDef(R), Operands(R), InferredFrom(nullptr) { + : TheDef(R), Operands(R), InferredFrom(nullptr) { Namespace = R->getValueAsString("Namespace"); AsmString = std::string(R->getValueAsString("AsmString")); isPreISelOpcode = R->getValueAsBit("isPreISelOpcode"); - isReturn = R->getValueAsBit("isReturn"); + isReturn = R->getValueAsBit("isReturn"); isEHScopeReturn = R->getValueAsBit("isEHScopeReturn"); - isBranch = R->getValueAsBit("isBranch"); + isBranch = R->getValueAsBit("isBranch"); isIndirectBranch = R->getValueAsBit("isIndirectBranch"); - isCompare = R->getValueAsBit("isCompare"); - isMoveImm = R->getValueAsBit("isMoveImm"); - isMoveReg = R->getValueAsBit("isMoveReg"); - isBitcast = R->getValueAsBit("isBitcast"); - isSelect = R->getValueAsBit("isSelect"); - isBarrier = R->getValueAsBit("isBarrier"); - isCall = R->getValueAsBit("isCall"); - isAdd = R->getValueAsBit("isAdd"); - isTrap = R->getValueAsBit("isTrap"); + isCompare = R->getValueAsBit("isCompare"); + isMoveImm = R->getValueAsBit("isMoveImm"); + isMoveReg = R->getValueAsBit("isMoveReg"); + isBitcast = R->getValueAsBit("isBitcast"); + isSelect = R->getValueAsBit("isSelect"); + isBarrier = R->getValueAsBit("isBarrier"); + isCall = R->getValueAsBit("isCall"); + isAdd = R->getValueAsBit("isAdd"); + isTrap = R->getValueAsBit("isTrap"); canFoldAsLoad = R->getValueAsBit("canFoldAsLoad"); - isPredicable = !R->getValueAsBit("isUnpredicable") && ( - Operands.isPredicable || R->getValueAsBit("isPredicable")); + isPredicable = 
!R->getValueAsBit("isUnpredicable") && + (Operands.isPredicable || R->getValueAsBit("isPredicable")); isConvertibleToThreeAddress = R->getValueAsBit("isConvertibleToThreeAddress"); isCommutable = R->getValueAsBit("isCommutable"); isTerminator = R->getValueAsBit("isTerminator"); @@ -457,7 +455,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) hasDelaySlot = R->getValueAsBit("hasDelaySlot"); usesCustomInserter = R->getValueAsBit("usesCustomInserter"); hasPostISelHook = R->getValueAsBit("hasPostISelHook"); - hasCtrlDep = R->getValueAsBit("hasCtrlDep"); + hasCtrlDep = R->getValueAsBit("hasCtrlDep"); isNotDuplicable = R->getValueAsBit("isNotDuplicable"); isRegSequence = R->getValueAsBit("isRegSequence"); isExtractSubreg = R->getValueAsBit("isExtractSubreg"); @@ -469,9 +467,9 @@ CodeGenInstruction::CodeGenInstruction(Record *R) isAuthenticated = R->getValueAsBit("isAuthenticated"); bool Unset; - mayLoad = R->getValueAsBitOrUnset("mayLoad", Unset); + mayLoad = R->getValueAsBitOrUnset("mayLoad", Unset); mayLoad_Unset = Unset; - mayStore = R->getValueAsBitOrUnset("mayStore", Unset); + mayStore = R->getValueAsBitOrUnset("mayStore", Unset); mayStore_Unset = Unset; mayRaiseFPException = R->getValueAsBit("mayRaiseFPException"); hasSideEffects = R->getValueAsBitOrUnset("hasSideEffects", Unset); @@ -494,8 +492,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) ParseConstraints(R->getValueAsString("Constraints"), Operands, R); // Parse the DisableEncoding field. - Operands.ProcessDisableEncoding( - R->getValueAsString("DisableEncoding")); + Operands.ProcessDisableEncoding(R->getValueAsString("DisableEncoding")); // First check for a ComplexDeprecationPredicate. if (R->getValue("ComplexDeprecationPredicate")) { @@ -516,25 +513,25 @@ CodeGenInstruction::CodeGenInstruction(Record *R) /// HasOneImplicitDefWithKnownVT - If the instruction has at least one /// implicit def and it has a known VT, return the VT, otherwise return /// MVT::Other. -MVT::SimpleValueType CodeGenInstruction:: -HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const { - if (ImplicitDefs.empty()) return MVT::Other; +MVT::SimpleValueType CodeGenInstruction::HasOneImplicitDefWithKnownVT( + const CodeGenTarget &TargetInfo) const { + if (ImplicitDefs.empty()) + return MVT::Other; // Check to see if the first implicit def has a resolvable type. Record *FirstImplicitDef = ImplicitDefs[0]; assert(FirstImplicitDef->isSubClassOf("Register")); const std::vector &RegVTs = - TargetInfo.getRegisterVTs(FirstImplicitDef); + TargetInfo.getRegisterVTs(FirstImplicitDef); if (RegVTs.size() == 1 && RegVTs[0].isSimple()) return RegVTs[0].getSimple().SimpleTy; return MVT::Other; } - /// FlattenAsmStringVariants - Flatten the specified AsmString to only /// include text from the specified variant, returning the new string. -std::string CodeGenInstruction:: -FlattenAsmStringVariants(StringRef Cur, unsigned Variant) { +std::string CodeGenInstruction::FlattenAsmStringVariants(StringRef Cur, + unsigned Variant) { std::string Res; for (;;) { @@ -542,8 +539,8 @@ FlattenAsmStringVariants(StringRef Cur, unsigned Variant) { size_t VariantsStart = 0; for (size_t e = Cur.size(); VariantsStart != e; ++VariantsStart) if (Cur[VariantsStart] == '{' && - (VariantsStart == 0 || (Cur[VariantsStart-1] != '$' && - Cur[VariantsStart-1] != '\\'))) + (VariantsStart == 0 || + (Cur[VariantsStart - 1] != '$' && Cur[VariantsStart - 1] != '\\'))) break; // Add the prefix to the result. 
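Aside (illustrative, not part of the patch): the hunks above mostly rewrap diagnostics in ParseConstraint(), but the logic being rewrapped is easy to lose in the noise. A constraint such as "$src1 = $dst" is resolved into two (operand, sub-operand) pairs, and the tie is recorded under the flattened MachineInstr operand number of the destination. The sketch below mirrors only that flattening arithmetic; OpInfo and the operand layout in main() are invented for illustration, not taken from any target.

// Minimal sketch of the arithmetic in
// CGIOperandList::getFlattenedOperandNumber(); OpInfo is a hypothetical
// stand-in for the two fields it reads.
#include <cassert>
#include <utility>
#include <vector>

struct OpInfo {
  unsigned MIOperandNo;   // first flat MI operand of this logical operand
  unsigned MINumOperands; // number of flat MI operands it expands to
};

// A logical (operand, sub-operand) pair maps to one flat MI operand index.
static unsigned flatten(const std::vector<OpInfo> &Ops,
                        std::pair<unsigned, unsigned> Op) {
  return Ops[Op.first].MIOperandNo + Op.second;
}

int main() {
  // Hypothetical layout: (outs GPR:$dst), (ins addr:$src1, GPR:$src2),
  // where "addr" expands to four MI operands (as an X86 address does).
  std::vector<OpInfo> Ops = {{0, 1}, {1, 4}, {5, 1}};
  assert(flatten(Ops, {0, 0}) == 0); // $dst
  assert(flatten(Ops, {1, 2}) == 3); // $src1.2
  assert(flatten(Ops, {2, 0}) == 5); // $src2
  return 0;
}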
@@ -557,7 +554,7 @@ FlattenAsmStringVariants(StringRef Cur, unsigned Variant) { size_t VariantsEnd = VariantsStart; unsigned NestedBraces = 1; for (size_t e = Cur.size(); VariantsEnd != e; ++VariantsEnd) { - if (Cur[VariantsEnd] == '}' && Cur[VariantsEnd-1] != '\\') { + if (Cur[VariantsEnd] == '}' && Cur[VariantsEnd - 1] != '\\') { if (--NestedBraces == 0) break; } else if (Cur[VariantsEnd] == '{') diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h index 4a34c29..ca7b1e9 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.h +++ b/llvm/utils/TableGen/CodeGenInstruction.h @@ -23,324 +23,320 @@ #include namespace llvm { - class Record; - class DagInit; - class CodeGenTarget; +class Record; +class DagInit; +class CodeGenTarget; - class CGIOperandList { - public: - class ConstraintInfo { - enum { None, EarlyClobber, Tied } Kind = None; - unsigned OtherTiedOperand = 0; - - public: - ConstraintInfo() = default; - - static ConstraintInfo getEarlyClobber() { - ConstraintInfo I; - I.Kind = EarlyClobber; - I.OtherTiedOperand = 0; - return I; - } - - static ConstraintInfo getTied(unsigned Op) { - ConstraintInfo I; - I.Kind = Tied; - I.OtherTiedOperand = Op; - return I; - } - - bool isNone() const { return Kind == None; } - bool isEarlyClobber() const { return Kind == EarlyClobber; } - bool isTied() const { return Kind == Tied; } +class CGIOperandList { +public: + class ConstraintInfo { + enum { None, EarlyClobber, Tied } Kind = None; + unsigned OtherTiedOperand = 0; - unsigned getTiedOperand() const { - assert(isTied()); - return OtherTiedOperand; - } - - bool operator==(const ConstraintInfo &RHS) const { - if (Kind != RHS.Kind) - return false; - if (Kind == Tied && OtherTiedOperand != RHS.OtherTiedOperand) - return false; - return true; - } - bool operator!=(const ConstraintInfo &RHS) const { - return !(*this == RHS); - } - }; - - /// OperandInfo - The information we keep track of for each operand in the - /// operand list for a tablegen instruction. - struct OperandInfo { - /// Rec - The definition this operand is declared as. - /// - Record *Rec; - - /// Name - If this operand was assigned a symbolic name, this is it, - /// otherwise, it's empty. - std::string Name; - - /// The names of sub-operands, if given, otherwise empty. - std::vector SubOpNames; - - /// PrinterMethodName - The method used to print operands of this type in - /// the asmprinter. - std::string PrinterMethodName; - - /// The method used to get the machine operand value for binary - /// encoding, per sub-operand. If empty, uses "getMachineOpValue". - std::vector EncoderMethodNames; - - /// OperandType - A value from MCOI::OperandType representing the type of - /// the operand. - std::string OperandType; - - /// MIOperandNo - Currently (this is meant to be phased out), some logical - /// operands correspond to multiple MachineInstr operands. In the X86 - /// target for example, one address operand is represented as 4 - /// MachineOperands. Because of this, the operand number in the - /// OperandList may not match the MachineInstr operand num. Until it - /// does, this contains the MI operand index of this operand. - unsigned MIOperandNo; - unsigned MINumOperands; // The number of operands. - - /// DoNotEncode - Bools are set to true in this vector for each operand in - /// the DisableEncoding list. These should not be emitted by the code - /// emitter. - BitVector DoNotEncode; - - /// MIOperandInfo - Default MI operand type. Note an operand may be made - /// up of multiple MI operands. 
- DagInit *MIOperandInfo; - - /// Constraint info for this operand. This operand can have pieces, so we - /// track constraint info for each. - std::vector Constraints; - - OperandInfo(Record *R, const std::string &N, const std::string &PMN, - const std::string &OT, unsigned MION, unsigned MINO, - DagInit *MIOI) - : Rec(R), Name(N), SubOpNames(MINO), PrinterMethodName(PMN), - EncoderMethodNames(MINO), OperandType(OT), MIOperandNo(MION), - MINumOperands(MINO), DoNotEncode(MINO), MIOperandInfo(MIOI), - Constraints(MINO) {} - - /// getTiedOperand - If this operand is tied to another one, return the - /// other operand number. Otherwise, return -1. - int getTiedRegister() const { - for (unsigned j = 0, e = Constraints.size(); j != e; ++j) { - const CGIOperandList::ConstraintInfo &CI = Constraints[j]; - if (CI.isTied()) return CI.getTiedOperand(); - } - return -1; - } - }; - - CGIOperandList(Record *D); - - Record *TheDef; // The actual record containing this OperandList. + public: + ConstraintInfo() = default; - /// NumDefs - Number of def operands declared, this is the number of - /// elements in the instruction's (outs) list. - /// - unsigned NumDefs; - - /// OperandList - The list of declared operands, along with their declared - /// type (which is a record). - std::vector OperandList; - - /// SubOpAliases - List of alias names for suboperands. - StringMap> SubOpAliases; - - // Information gleaned from the operand list. - bool isPredicable; - bool hasOptionalDef; - bool isVariadic; - - // Provide transparent accessors to the operand list. - bool empty() const { return OperandList.empty(); } - unsigned size() const { return OperandList.size(); } - const OperandInfo &operator[](unsigned i) const { return OperandList[i]; } - OperandInfo &operator[](unsigned i) { return OperandList[i]; } - OperandInfo &back() { return OperandList.back(); } - const OperandInfo &back() const { return OperandList.back(); } - - typedef std::vector::iterator iterator; - typedef std::vector::const_iterator const_iterator; - iterator begin() { return OperandList.begin(); } - const_iterator begin() const { return OperandList.begin(); } - iterator end() { return OperandList.end(); } - const_iterator end() const { return OperandList.end(); } - - /// getOperandNamed - Return the index of the operand with the specified - /// non-empty name. If the instruction does not have an operand with the - /// specified name, abort. - unsigned getOperandNamed(StringRef Name) const; - - /// hasOperandNamed - Query whether the instruction has an operand of the - /// given name. If so, return true and set OpIdx to the index of the - /// operand. Otherwise, return false. - bool hasOperandNamed(StringRef Name, unsigned &OpIdx) const; - - bool hasSubOperandAlias(StringRef Name, - std::pair &SubOp) const; - - /// ParseOperandName - Parse an operand name like "$foo" or "$foo.bar", - /// where $foo is a whole operand and $foo.bar refers to a suboperand. - /// This aborts if the name is invalid. If AllowWholeOp is true, references - /// to operands with suboperands are allowed, otherwise not. - std::pair ParseOperandName(StringRef Op, - bool AllowWholeOp = true); - - /// getFlattenedOperandNumber - Flatten a operand/suboperand pair into a - /// flat machineinstr operand #. 
- unsigned getFlattenedOperandNumber(std::pair Op) const { - return OperandList[Op.first].MIOperandNo + Op.second; + static ConstraintInfo getEarlyClobber() { + ConstraintInfo I; + I.Kind = EarlyClobber; + I.OtherTiedOperand = 0; + return I; } - /// getSubOperandNumber - Unflatten a operand number into an - /// operand/suboperand pair. - std::pair getSubOperandNumber(unsigned Op) const { - for (unsigned i = 0; ; ++i) { - assert(i < OperandList.size() && "Invalid flat operand #"); - if (OperandList[i].MIOperandNo+OperandList[i].MINumOperands > Op) - return std::make_pair(i, Op-OperandList[i].MIOperandNo); - } + static ConstraintInfo getTied(unsigned Op) { + ConstraintInfo I; + I.Kind = Tied; + I.OtherTiedOperand = Op; + return I; } + bool isNone() const { return Kind == None; } + bool isEarlyClobber() const { return Kind == EarlyClobber; } + bool isTied() const { return Kind == Tied; } - /// isFlatOperandNotEmitted - Return true if the specified flat operand # - /// should not be emitted with the code emitter. - bool isFlatOperandNotEmitted(unsigned FlatOpNo) const { - std::pair Op = getSubOperandNumber(FlatOpNo); - if (OperandList[Op.first].DoNotEncode.size() > Op.second) - return OperandList[Op.first].DoNotEncode[Op.second]; - return false; + unsigned getTiedOperand() const { + assert(isTied()); + return OtherTiedOperand; } - void ProcessDisableEncoding(StringRef Value); - }; - - - class CodeGenInstruction { - public: - Record *TheDef; // The actual record defining this instruction. - StringRef Namespace; // The namespace the instruction is in. - - /// AsmString - The format string used to emit a .s file for the - /// instruction. - std::string AsmString; - - /// Operands - This is information about the (ins) and (outs) list specified - /// to the instruction. - CGIOperandList Operands; - - /// ImplicitDefs/ImplicitUses - These are lists of registers that are - /// implicitly defined and used by the instruction. - std::vector ImplicitDefs, ImplicitUses; - - // Various boolean values we track for the instruction. - bool isPreISelOpcode : 1; - bool isReturn : 1; - bool isEHScopeReturn : 1; - bool isBranch : 1; - bool isIndirectBranch : 1; - bool isCompare : 1; - bool isMoveImm : 1; - bool isMoveReg : 1; - bool isBitcast : 1; - bool isSelect : 1; - bool isBarrier : 1; - bool isCall : 1; - bool isAdd : 1; - bool isTrap : 1; - bool canFoldAsLoad : 1; - bool mayLoad : 1; - bool mayLoad_Unset : 1; - bool mayStore : 1; - bool mayStore_Unset : 1; - bool mayRaiseFPException : 1; - bool isPredicable : 1; - bool isConvertibleToThreeAddress : 1; - bool isCommutable : 1; - bool isTerminator : 1; - bool isReMaterializable : 1; - bool hasDelaySlot : 1; - bool usesCustomInserter : 1; - bool hasPostISelHook : 1; - bool hasCtrlDep : 1; - bool isNotDuplicable : 1; - bool hasSideEffects : 1; - bool hasSideEffects_Unset : 1; - bool isAsCheapAsAMove : 1; - bool hasExtraSrcRegAllocReq : 1; - bool hasExtraDefRegAllocReq : 1; - bool isCodeGenOnly : 1; - bool isPseudo : 1; - bool isMeta : 1; - bool isRegSequence : 1; - bool isExtractSubreg : 1; - bool isInsertSubreg : 1; - bool isConvergent : 1; - bool hasNoSchedulingInfo : 1; - bool FastISelShouldIgnore : 1; - bool hasChain : 1; - bool hasChain_Inferred : 1; - bool variadicOpsAreDefs : 1; - bool isAuthenticated : 1; - - std::string DeprecatedReason; - bool HasComplexDeprecationPredicate; - - /// Are there any undefined flags? 
- bool hasUndefFlags() const { - return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset; - } - - // The record used to infer instruction flags, or NULL if no flag values - // have been inferred. - Record *InferredFrom; - - // The enum value assigned by CodeGenTarget::computeInstrsByEnum. - mutable unsigned EnumVal; - - CodeGenInstruction(Record *R); - - /// HasOneImplicitDefWithKnownVT - If the instruction has at least one - /// implicit def and it has a known VT, return the VT, otherwise return - /// MVT::Other. - MVT::SimpleValueType - HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const; - - - /// FlattenAsmStringVariants - Flatten the specified AsmString to only - /// include text from the specified variant, returning the new string. - static std::string FlattenAsmStringVariants(StringRef AsmString, - unsigned Variant); - - // Is the specified operand in a generic instruction implicitly a pointer. - // This can be used on intructions that use typeN or ptypeN to identify - // operands that should be considered as pointers even though SelectionDAG - // didn't make a distinction between integer and pointers. - bool isInOperandAPointer(unsigned i) const { - return isOperandImpl("InOperandList", i, "IsPointer"); + bool operator==(const ConstraintInfo &RHS) const { + if (Kind != RHS.Kind) + return false; + if (Kind == Tied && OtherTiedOperand != RHS.OtherTiedOperand) + return false; + return true; } + bool operator!=(const ConstraintInfo &RHS) const { return !(*this == RHS); } + }; - bool isOutOperandAPointer(unsigned i) const { - return isOperandImpl("OutOperandList", i, "IsPointer"); + /// OperandInfo - The information we keep track of for each operand in the + /// operand list for a tablegen instruction. + struct OperandInfo { + /// Rec - The definition this operand is declared as. + /// + Record *Rec; + + /// Name - If this operand was assigned a symbolic name, this is it, + /// otherwise, it's empty. + std::string Name; + + /// The names of sub-operands, if given, otherwise empty. + std::vector SubOpNames; + + /// PrinterMethodName - The method used to print operands of this type in + /// the asmprinter. + std::string PrinterMethodName; + + /// The method used to get the machine operand value for binary + /// encoding, per sub-operand. If empty, uses "getMachineOpValue". + std::vector EncoderMethodNames; + + /// OperandType - A value from MCOI::OperandType representing the type of + /// the operand. + std::string OperandType; + + /// MIOperandNo - Currently (this is meant to be phased out), some logical + /// operands correspond to multiple MachineInstr operands. In the X86 + /// target for example, one address operand is represented as 4 + /// MachineOperands. Because of this, the operand number in the + /// OperandList may not match the MachineInstr operand num. Until it + /// does, this contains the MI operand index of this operand. + unsigned MIOperandNo; + unsigned MINumOperands; // The number of operands. + + /// DoNotEncode - Bools are set to true in this vector for each operand in + /// the DisableEncoding list. These should not be emitted by the code + /// emitter. + BitVector DoNotEncode; + + /// MIOperandInfo - Default MI operand type. Note an operand may be made + /// up of multiple MI operands. + DagInit *MIOperandInfo; + + /// Constraint info for this operand. This operand can have pieces, so we + /// track constraint info for each. 
+ std::vector Constraints; + + OperandInfo(Record *R, const std::string &N, const std::string &PMN, + const std::string &OT, unsigned MION, unsigned MINO, + DagInit *MIOI) + : Rec(R), Name(N), SubOpNames(MINO), PrinterMethodName(PMN), + EncoderMethodNames(MINO), OperandType(OT), MIOperandNo(MION), + MINumOperands(MINO), DoNotEncode(MINO), MIOperandInfo(MIOI), + Constraints(MINO) {} + + /// getTiedOperand - If this operand is tied to another one, return the + /// other operand number. Otherwise, return -1. + int getTiedRegister() const { + for (unsigned j = 0, e = Constraints.size(); j != e; ++j) { + const CGIOperandList::ConstraintInfo &CI = Constraints[j]; + if (CI.isTied()) + return CI.getTiedOperand(); + } + return -1; } + }; - /// Check if the operand is required to be an immediate. - bool isInOperandImmArg(unsigned i) const { - return isOperandImpl("InOperandList", i, "IsImmediate"); + CGIOperandList(Record *D); + + Record *TheDef; // The actual record containing this OperandList. + + /// NumDefs - Number of def operands declared, this is the number of + /// elements in the instruction's (outs) list. + /// + unsigned NumDefs; + + /// OperandList - The list of declared operands, along with their declared + /// type (which is a record). + std::vector OperandList; + + /// SubOpAliases - List of alias names for suboperands. + StringMap> SubOpAliases; + + // Information gleaned from the operand list. + bool isPredicable; + bool hasOptionalDef; + bool isVariadic; + + // Provide transparent accessors to the operand list. + bool empty() const { return OperandList.empty(); } + unsigned size() const { return OperandList.size(); } + const OperandInfo &operator[](unsigned i) const { return OperandList[i]; } + OperandInfo &operator[](unsigned i) { return OperandList[i]; } + OperandInfo &back() { return OperandList.back(); } + const OperandInfo &back() const { return OperandList.back(); } + + typedef std::vector::iterator iterator; + typedef std::vector::const_iterator const_iterator; + iterator begin() { return OperandList.begin(); } + const_iterator begin() const { return OperandList.begin(); } + iterator end() { return OperandList.end(); } + const_iterator end() const { return OperandList.end(); } + + /// getOperandNamed - Return the index of the operand with the specified + /// non-empty name. If the instruction does not have an operand with the + /// specified name, abort. + unsigned getOperandNamed(StringRef Name) const; + + /// hasOperandNamed - Query whether the instruction has an operand of the + /// given name. If so, return true and set OpIdx to the index of the + /// operand. Otherwise, return false. + bool hasOperandNamed(StringRef Name, unsigned &OpIdx) const; + + bool hasSubOperandAlias(StringRef Name, + std::pair &SubOp) const; + + /// ParseOperandName - Parse an operand name like "$foo" or "$foo.bar", + /// where $foo is a whole operand and $foo.bar refers to a suboperand. + /// This aborts if the name is invalid. If AllowWholeOp is true, references + /// to operands with suboperands are allowed, otherwise not. + std::pair ParseOperandName(StringRef Op, + bool AllowWholeOp = true); + + /// getFlattenedOperandNumber - Flatten a operand/suboperand pair into a + /// flat machineinstr operand #. + unsigned getFlattenedOperandNumber(std::pair Op) const { + return OperandList[Op.first].MIOperandNo + Op.second; + } + + /// getSubOperandNumber - Unflatten a operand number into an + /// operand/suboperand pair. 
+ std::pair getSubOperandNumber(unsigned Op) const { + for (unsigned i = 0;; ++i) { + assert(i < OperandList.size() && "Invalid flat operand #"); + if (OperandList[i].MIOperandNo + OperandList[i].MINumOperands > Op) + return std::make_pair(i, Op - OperandList[i].MIOperandNo); } - - private: - bool isOperandImpl(StringRef OpListName, unsigned i, - StringRef PropertyName) const; - }; + } + + /// isFlatOperandNotEmitted - Return true if the specified flat operand # + /// should not be emitted with the code emitter. + bool isFlatOperandNotEmitted(unsigned FlatOpNo) const { + std::pair Op = getSubOperandNumber(FlatOpNo); + if (OperandList[Op.first].DoNotEncode.size() > Op.second) + return OperandList[Op.first].DoNotEncode[Op.second]; + return false; + } + + void ProcessDisableEncoding(StringRef Value); +}; + +class CodeGenInstruction { +public: + Record *TheDef; // The actual record defining this instruction. + StringRef Namespace; // The namespace the instruction is in. + + /// AsmString - The format string used to emit a .s file for the + /// instruction. + std::string AsmString; + + /// Operands - This is information about the (ins) and (outs) list specified + /// to the instruction. + CGIOperandList Operands; + + /// ImplicitDefs/ImplicitUses - These are lists of registers that are + /// implicitly defined and used by the instruction. + std::vector ImplicitDefs, ImplicitUses; + + // Various boolean values we track for the instruction. + bool isPreISelOpcode : 1; + bool isReturn : 1; + bool isEHScopeReturn : 1; + bool isBranch : 1; + bool isIndirectBranch : 1; + bool isCompare : 1; + bool isMoveImm : 1; + bool isMoveReg : 1; + bool isBitcast : 1; + bool isSelect : 1; + bool isBarrier : 1; + bool isCall : 1; + bool isAdd : 1; + bool isTrap : 1; + bool canFoldAsLoad : 1; + bool mayLoad : 1; + bool mayLoad_Unset : 1; + bool mayStore : 1; + bool mayStore_Unset : 1; + bool mayRaiseFPException : 1; + bool isPredicable : 1; + bool isConvertibleToThreeAddress : 1; + bool isCommutable : 1; + bool isTerminator : 1; + bool isReMaterializable : 1; + bool hasDelaySlot : 1; + bool usesCustomInserter : 1; + bool hasPostISelHook : 1; + bool hasCtrlDep : 1; + bool isNotDuplicable : 1; + bool hasSideEffects : 1; + bool hasSideEffects_Unset : 1; + bool isAsCheapAsAMove : 1; + bool hasExtraSrcRegAllocReq : 1; + bool hasExtraDefRegAllocReq : 1; + bool isCodeGenOnly : 1; + bool isPseudo : 1; + bool isMeta : 1; + bool isRegSequence : 1; + bool isExtractSubreg : 1; + bool isInsertSubreg : 1; + bool isConvergent : 1; + bool hasNoSchedulingInfo : 1; + bool FastISelShouldIgnore : 1; + bool hasChain : 1; + bool hasChain_Inferred : 1; + bool variadicOpsAreDefs : 1; + bool isAuthenticated : 1; + + std::string DeprecatedReason; + bool HasComplexDeprecationPredicate; + + /// Are there any undefined flags? + bool hasUndefFlags() const { + return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset; + } + + // The record used to infer instruction flags, or NULL if no flag values + // have been inferred. + Record *InferredFrom; + + // The enum value assigned by CodeGenTarget::computeInstrsByEnum. + mutable unsigned EnumVal; + + CodeGenInstruction(Record *R); + + /// HasOneImplicitDefWithKnownVT - If the instruction has at least one + /// implicit def and it has a known VT, return the VT, otherwise return + /// MVT::Other. 
+ MVT::SimpleValueType + HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const; + + /// FlattenAsmStringVariants - Flatten the specified AsmString to only + /// include text from the specified variant, returning the new string. + static std::string FlattenAsmStringVariants(StringRef AsmString, + unsigned Variant); + + // Is the specified operand in a generic instruction implicitly a pointer. + // This can be used on intructions that use typeN or ptypeN to identify + // operands that should be considered as pointers even though SelectionDAG + // didn't make a distinction between integer and pointers. + bool isInOperandAPointer(unsigned i) const { + return isOperandImpl("InOperandList", i, "IsPointer"); + } + + bool isOutOperandAPointer(unsigned i) const { + return isOperandImpl("OutOperandList", i, "IsPointer"); + } + + /// Check if the operand is required to be an immediate. + bool isInOperandImmArg(unsigned i) const { + return isOperandImpl("InOperandList", i, "IsImmediate"); + } + +private: + bool isOperandImpl(StringRef OpListName, unsigned i, + StringRef PropertyName) const; +}; } // namespace llvm #endif diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h index f3452f5..da9e386 100644 --- a/llvm/utils/TableGen/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/CodeGenIntrinsics.h @@ -25,12 +25,12 @@ class Record; class RecordKeeper; struct CodeGenIntrinsic { - Record *TheDef; // The actual record defining this intrinsic. - std::string Name; // The name of the LLVM function "llvm.bswap.i32" - std::string EnumName; // The name of the enum "bswap_i32" + Record *TheDef; // The actual record defining this intrinsic. + std::string Name; // The name of the LLVM function "llvm.bswap.i32" + std::string EnumName; // The name of the enum "bswap_i32" std::string ClangBuiltinName; // Name of the corresponding GCC builtin, or "". - std::string MSBuiltinName; // Name of the corresponding MS builtin, or "". - std::string TargetPrefix; // Target prefix, e.g. "ppc" for t-s intrinsics. + std::string MSBuiltinName; // Name of the corresponding MS builtin, or "". + std::string TargetPrefix; // Target prefix, e.g. "ppc" for t-s intrinsics. /// This structure holds the return values and parameter values of an /// intrinsic. If the number of return values is > 1, then the intrinsic @@ -136,9 +136,7 @@ struct CodeGenIntrinsic { void addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V = 0); - bool hasProperty(enum SDNP Prop) const { - return Properties & (1 << Prop); - } + bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); } /// Goes through all IntrProperties that have IsDefault /// value set and sets the property. 
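Aside (illustrative, not part of the patch): the CodeGenIntrinsics.h hunk above only collapses hasProperty() onto one line; the scheme behind it is one bit per SDNP property OR-ed into an integer mask. A minimal sketch of that scheme follows. The enumerator names echo LLVM's SDNP values, but the numbering here is a made-up assumption.

#include <cassert>
#include <cstdint>

// Hypothetical stand-ins for the SDNP enumerators and their values.
enum SDNP { SDNPHasChain = 0, SDNPMayLoad = 1, SDNPMayStore = 2 };

struct IntrinsicSketch {
  std::uint32_t Properties = 0;
  void setProperty(SDNP P) { Properties |= (1u << P); }
  // Mirrors the bit test in CodeGenIntrinsic::hasProperty().
  bool hasProperty(SDNP P) const { return Properties & (1u << P); }
};

int main() {
  IntrinsicSketch I;
  I.setProperty(SDNPMayLoad);
  assert(I.hasProperty(SDNPMayLoad));
  assert(!I.hasProperty(SDNPMayStore));
  return 0;
}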
@@ -182,6 +180,6 @@ public:
     return Intrinsics[Pos];
   }
 };
-}
+} // namespace llvm
 
 #endif
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index fd375735..03af0b4 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -80,9 +80,9 @@
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 using namespace llvm;
-typedef std::map<std::string, std::vector<Record*> > InstrRelMapTy;
+typedef std::map<std::string, std::vector<Record *>> InstrRelMapTy;
 
-typedef std::map<std::vector<Init*>, std::vector<Record*> > RowInstrMapTy;
+typedef std::map<std::vector<Init *>, std::vector<Record *>> RowInstrMapTy;
 
 namespace {
 
@@ -95,10 +95,10 @@ private:
   ListInit *RowFields;
   ListInit *ColFields;
   ListInit *KeyCol;
-  std::vector<ListInit*> ValueCols;
+  std::vector<ListInit *> ValueCols;
 
 public:
-  InstrMap(Record* MapRec) {
+  InstrMap(Record *MapRec) {
     Name = std::string(MapRec->getName());
 
     // FilterClass - It's used to reduce the search space only to the
@@ -130,7 +130,8 @@ public:
     // Each instruction map must specify at least one column for it to be valid.
     if (ColValList->empty())
       PrintFatalError(MapRec->getLoc(), "InstrMapping record `" +
-          MapRec->getName() + "' has empty " + "`ValueCols' field!");
+                                            MapRec->getName() + "' has empty " +
+                                            "`ValueCols' field!");
 
     for (Init *I : ColValList->getValues()) {
       auto *ColI = cast<ListInit>(I);
 
@@ -138,9 +139,10 @@ public:
       // Make sure that all the sub-lists in 'ValueCols' have same number of
       // elements as the fields in 'ColFields'.
       if (ColI->size() != ColFields->size())
-        PrintFatalError(MapRec->getLoc(), "Record `" + MapRec->getName() +
-          "', field `ValueCols' entries don't match with " +
-          " the entries in 'ColFields'!");
+        PrintFatalError(MapRec->getLoc(),
+                        "Record `" + MapRec->getName() +
+                            "', field `ValueCols' entries don't match with " +
+                            " the entries in 'ColFields'!");
       ValueCols.push_back(ColI);
     }
   }
@@ -155,13 +157,10 @@ public:
 
   ListInit *getKeyCol() const { return KeyCol; }
 
-  const std::vector<ListInit*> &getValueCols() const {
-    return ValueCols;
-  }
+  const std::vector<ListInit *> &getValueCols() const { return ValueCols; }
 };
 } // end anonymous namespace
-
 //===----------------------------------------------------------------------===//
 // class MapTableEmitter : It builds the instruction relation maps using
 // the information provided in InstrMapping records. It outputs these
@@ -171,26 +170,26 @@ public:
 namespace {
 class MapTableEmitter {
 private:
-// std::string TargetName;
+  // std::string TargetName;
   const CodeGenTarget &Target;
   // InstrMapDesc - InstrMapping record to be processed.
   InstrMap InstrMapDesc;
 
   // InstrDefs - list of instructions filtered using FilterClass defined
   // in InstrMapDesc.
-  std::vector<Record*> InstrDefs;
+  std::vector<Record *> InstrDefs;
 
   // RowInstrMap - maps RowFields values to the instructions. It's keyed by the
   // values of the row fields and contains vector of records as values.
   RowInstrMapTy RowInstrMap;
 
   // KeyInstrVec - list of key instructions.
-  std::vector<Record*> KeyInstrVec;
-  DenseMap<Record*, std::vector<Record*> > MapTable;
+  std::vector<Record *> KeyInstrVec;
+  DenseMap<Record *, std::vector<Record *>> MapTable;
 
 public:
-  MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec):
-                  Target(Target), InstrMapDesc(IMRec) {
+  MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec)
+      : Target(Target), InstrMapDesc(IMRec) {
     const std::string &FilterClass = InstrMapDesc.getFilterClass();
     InstrDefs = Records.getAllDerivedDefinitions(FilterClass);
   }
 
@@ -199,7 +198,7 @@ public:
 
   // Returns true if an instruction is a key instruction, i.e., its ColFields
   // have same values as KeyCol.
-  bool isKeyColInstr(Record* CurInstr);
+  bool isKeyColInstr(Record *CurInstr);
 
   // Find column instruction corresponding to a key instruction based on the
   // constraints for that column.
@@ -215,11 +214,9 @@ public:
 
   // Lookup functions to query binary search tables.
   void emitMapFuncBody(raw_ostream &OS, unsigned TableSize);
-
 };
 } // end anonymous namespace
-
 //===----------------------------------------------------------------------===//
 // Process all the instructions that model this relation (alreday present in
 // InstrDefs) and insert them into RowInstrMap which is keyed by the values of
@@ -230,14 +227,15 @@
 void MapTableEmitter::buildRowInstrMap() {
 
   for (Record *CurInstr : InstrDefs) {
-    std::vector<Init*> KeyValue;
+    std::vector<Init *> KeyValue;
     ListInit *RowFields = InstrMapDesc.getRowFields();
     for (Init *RowField : RowFields->getValues()) {
       RecordVal *RecVal = CurInstr->getValue(RowField);
       if (RecVal == nullptr)
-        PrintFatalError(CurInstr->getLoc(), "No value " +
-            RowField->getAsString() + " found in \"" +
-            CurInstr->getName() + "\" instruction description.");
+        PrintFatalError(CurInstr->getLoc(),
+                        "No value " + RowField->getAsString() + " found in \"" +
+                            CurInstr->getName() +
+                            "\" instruction description.");
       Init *CurInstrVal = RecVal->getValue();
       KeyValue.push_back(CurInstrVal);
     }
@@ -256,14 +254,14 @@ void MapTableEmitter::buildRowInstrMap() {
 // Return true if an instruction is a KeyCol instruction.
 //===----------------------------------------------------------------------===//
 
-bool MapTableEmitter::isKeyColInstr(Record* CurInstr) {
+bool MapTableEmitter::isKeyColInstr(Record *CurInstr) {
   ListInit *ColFields = InstrMapDesc.getColFields();
   ListInit *KeyCol = InstrMapDesc.getKeyCol();
 
   // Check if the instruction is a KeyCol instruction.
   bool MatchFound = true;
-  for (unsigned j = 0, endCF = ColFields->size();
-       (j < endCF) && MatchFound; j++) {
+  for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
+       j++) {
     RecordVal *ColFieldName = CurInstr->getValue(ColFields->getElement(j));
     std::string CurInstrVal = ColFieldName->getValue()->getAsUnquotedString();
     std::string KeyColValue = KeyCol->getElement(j)->getAsUnquotedString();
@@ -280,10 +278,10 @@ bool MapTableEmitter::isKeyColInstr(Record* CurInstr) {
 void MapTableEmitter::buildMapTable() {
   // Find column instructions for a given key based on the ColField
   // constraints.
-  const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
   unsigned NumOfCols = ValueCols.size();
   for (Record *CurKeyInstr : KeyInstrVec) {
-    std::vector<Record*> ColInstrVec(NumOfCols);
+    std::vector<Record *> ColInstrVec(NumOfCols);
 
     // Find the column instruction based on the constraints for the column.
     for (unsigned ColIdx = 0; ColIdx < NumOfCols; ColIdx++) {
@@ -302,7 +300,7 @@ void MapTableEmitter::buildMapTable() {
 Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
                                            ListInit *CurValueCol) {
   ListInit *RowFields = InstrMapDesc.getRowFields();
-  std::vector<Init*> KeyValue;
+  std::vector<Init *> KeyValue;
 
   // Construct KeyValue using KeyInstr's values for RowFields.
   for (Init *RowField : RowFields->getValues()) {
@@ -314,15 +312,15 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
 
   // in RowInstrMap. We search through these instructions to find a match
   // for the current column, i.e., the instruction which has the same values
   // as CurValueCol for all the fields in ColFields.
-  const std::vector<Record*> &RelatedInstrVec = RowInstrMap[KeyValue];
+  const std::vector<Record *> &RelatedInstrVec = RowInstrMap[KeyValue];
 
   ListInit *ColFields = InstrMapDesc.getColFields();
   Record *MatchInstr = nullptr;
 
   for (llvm::Record *CurInstr : RelatedInstrVec) {
     bool MatchFound = true;
-    for (unsigned j = 0, endCF = ColFields->size();
-         (j < endCF) && MatchFound; j++) {
+    for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
+         j++) {
       Init *ColFieldJ = ColFields->getElement(j);
       Init *CurInstrInit = CurInstr->getValue(ColFieldJ)->getValue();
       std::string CurInstrVal = CurInstrInit->getAsUnquotedString();
@@ -360,21 +358,21 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
 
 unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
 
-  ArrayRef<const CodeGenInstruction*> NumberedInstructions =
-                                            Target.getInstructionsByEnumValue();
+  ArrayRef<const CodeGenInstruction *> NumberedInstructions =
+      Target.getInstructionsByEnumValue();
   StringRef Namespace = Target.getInstNamespace();
-  const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
   unsigned NumCol = ValueCols.size();
   unsigned TotalNumInstr = NumberedInstructions.size();
   unsigned TableSize = 0;
 
-  OS << "static const uint16_t "<<InstrMapDesc.getName();
+  OS << "static const uint16_t " << InstrMapDesc.getName();
   // Number of columns in the table are NumCol+1 because key instructions are
   // emitted as first column.
-  OS << "Table[][" << NumCol+1 << "] = {\n";
+  OS << "Table[][" << NumCol + 1 << "] = {\n";
   for (unsigned i = 0; i < TotalNumInstr; i++) {
     Record *CurInstr = NumberedInstructions[i]->TheDef;
-    std::vector<Record*> ColInstrs = MapTable[CurInstr];
+    std::vector<Record *> ColInstrs = MapTable[CurInstr];
     std::string OutStr;
     unsigned RelExists = 0;
     if (!ColInstrs.empty()) {
@@ -385,19 +383,23 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
           OutStr += Namespace;
           OutStr += "::";
           OutStr += ColInstrs[j]->getName();
-        } else { OutStr += ", (uint16_t)-1U";}
+        } else {
+          OutStr += ", (uint16_t)-1U";
+        }
       }
 
       if (RelExists) {
         OS << " { " << Namespace << "::" << CurInstr->getName();
-        OS << OutStr <<" },\n";
+        OS << OutStr << " },\n";
         TableSize++;
       }
     }
   }
 
   if (!TableSize) {
-    OS << " { " << Namespace << "::" << "INSTRUCTION_LIST_END, ";
-    OS << Namespace << "::" << "INSTRUCTION_LIST_END }";
+    OS << " { " << Namespace << "::"
+       << "INSTRUCTION_LIST_END, ";
+    OS << Namespace << "::"
+       << "INSTRUCTION_LIST_END }";
   }
   OS << "}; // End of " << InstrMapDesc.getName() << "Table\n\n";
   return TableSize;
@@ -430,11 +432,10 @@ void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) {
 // Emit functions to query relation tables.
 //===----------------------------------------------------------------------===//
 
-void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
-                                      unsigned TableSize) {
+void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) {
 
   ListInit *ColFields = InstrMapDesc.getColFields();
-  const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
 
   // Emit binary search algorithm to locate instructions in the
   // relation table. If found, return opcode value from the appropriate column
@@ -455,14 +456,13 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
   }
   OS << ")\n";
   OS << "    return " << InstrMapDesc.getName();
-  OS << "Table[mid]["< &ValueCols = InstrMapDesc.getValueCols();
-  OS << "// "<< InstrMapDesc.getName() << "\nLLVM_READONLY\n";
-  OS << "int "<< InstrMapDesc.getName() << "(uint16_t Opcode";
+  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
+  OS << "// " << InstrMapDesc.getName() << "\nLLVM_READONLY\n";
+  OS << "int " << InstrMapDesc.getName() << "(uint16_t Opcode";
   if (ValueCols.size() > 1) {
     for (Init *CF : ColFields->getValues()) {
       std::string ColName = CF->getAsUnquotedString();
@@ -501,9 +501,9 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
 
 static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
 
-  std::vector<Record*> InstrMapVec;
+  std::vector<Record *> InstrMapVec;
   InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
-  std::map<std::string, std::vector<Init*> > ColFieldValueMap;
+  std::map<std::string, std::vector<Init *>> ColFieldValueMap;
 
   // Iterate over all InstrMapping records and create a map between column
   // fields and their possible values across all records.
@@ -511,20 +511,22 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
     ListInit *ColFields;
     ColFields = CurMap->getValueAsListInit("ColFields");
     ListInit *List = CurMap->getValueAsListInit("ValueCols");
-    std::vector<ListInit*> ValueCols;
+    std::vector<ListInit *> ValueCols;
     unsigned ListSize = List->size();
 
     for (unsigned j = 0; j < ListSize; j++) {
       auto *ListJ = cast<ListInit>(List->getElement(j));
 
       if (ListJ->size() != ColFields->size())
-        PrintFatalError("Record `" + CurMap->getName() + "', field "
-          "`ValueCols' entries don't match with the entries in 'ColFields' !");
+        PrintFatalError("Record `" + CurMap->getName() +
+                        "', field "
+                        "`ValueCols' entries don't match with the entries in "
+                        "'ColFields' !");
       ValueCols.push_back(ListJ);
     }
 
     for (unsigned j = 0, endCF = ColFields->size(); j < endCF; j++) {
-      for (unsigned k = 0; k < ListSize; k++){
+      for (unsigned k = 0; k < ListSize; k++) {
        std::string ColName = ColFields->getElement(j)->getAsUnquotedString();
        ColFieldValueMap[ColName].push_back((ValueCols[k])->getElement(j));
       }
@@ -532,14 +534,14 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
 
   for (auto &Entry : ColFieldValueMap) {
-    std::vector<Init*> FieldValues = Entry.second;
+    std::vector<Init *> FieldValues = Entry.second;
 
     // Delete duplicate entries from ColFieldValueMap
     for (unsigned i = 0; i < FieldValues.size() - 1; i++) {
       Init *CurVal = FieldValues[i];
-      for (unsigned j = i+1; j < FieldValues.size(); j++) {
+      for (unsigned j = i + 1; j < FieldValues.size(); j++) {
         if (CurVal == FieldValues[j]) {
-          FieldValues.erase(FieldValues.begin()+j);
+          FieldValues.erase(FieldValues.begin() + j);
           --j;
         }
       }
@@ -566,7 +568,7 @@ namespace llvm {
 void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   CodeGenTarget Target(Records);
   StringRef NameSpace = Target.getInstNamespace();
-  std::vector<Record*> InstrMapVec;
+  std::vector<Record *> InstrMapVec;
   InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
 
   if (InstrMapVec.empty())
@@ -603,4 +605,4 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   OS << "#endif // GET_INSTRMAP_INFO\n\n";
 }
-} // End llvm namespace
+} // namespace llvm
diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp
index d1abdb7..4b89540 100644
--- a/llvm/utils/TableGen/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/CodeGenRegisters.cpp
@@ -48,7 +48,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===// CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum) - : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) { + : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) { Name = std::string(R->getName()); if (R->getValue("Namespace")) Namespace = std::string(R->getValueAsString("Namespace")); @@ -74,7 +74,7 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) { if (!TheDef) return; - std::vector Comps = TheDef->getValueAsListOfDefs("ComposedOf"); + std::vector Comps = TheDef->getValueAsListOfDefs("ComposedOf"); if (!Comps.empty()) { if (Comps.size() != 2) PrintFatalError(TheDef->getLoc(), @@ -86,13 +86,13 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) { PrintFatalError(TheDef->getLoc(), "Ambiguous ComposedOf entries"); } - std::vector Parts = - TheDef->getValueAsListOfDefs("CoveringSubRegIndices"); + std::vector Parts = + TheDef->getValueAsListOfDefs("CoveringSubRegIndices"); if (!Parts.empty()) { if (Parts.size() < 2) PrintFatalError(TheDef->getLoc(), "CoveredBySubRegs must have two or more entries"); - SmallVector IdxParts; + SmallVector IdxParts; for (Record *Part : Parts) IdxParts.push_back(RegBank.getSubRegIdx(Part)); setConcatenationOf(IdxParts); @@ -117,17 +117,19 @@ LaneBitmask CodeGenSubRegIndex::computeLaneMask() const { } void CodeGenSubRegIndex::setConcatenationOf( - ArrayRef Parts) { + ArrayRef Parts) { if (ConcatenationOf.empty()) ConcatenationOf.assign(Parts.begin(), Parts.end()); else - assert(std::equal(Parts.begin(), Parts.end(), - ConcatenationOf.begin()) && "parts consistent"); + assert(std::equal(Parts.begin(), Parts.end(), ConcatenationOf.begin()) && + "parts consistent"); } void CodeGenSubRegIndex::computeConcatTransitiveClosure() { - for (SmallVectorImpl::iterator - I = ConcatenationOf.begin(); I != ConcatenationOf.end(); /*empty*/) { + for (SmallVectorImpl::iterator I = + ConcatenationOf.begin(); + I != ConcatenationOf.end(); + /*empty*/) { CodeGenSubRegIndex *SubIdx = *I; SubIdx->computeConcatTransitiveClosure(); #ifndef NDEBUG @@ -160,8 +162,8 @@ CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum) } void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) { - std::vector SRIs = TheDef->getValueAsListOfDefs("SubRegIndices"); - std::vector SRs = TheDef->getValueAsListOfDefs("SubRegs"); + std::vector SRIs = TheDef->getValueAsListOfDefs("SubRegIndices"); + std::vector SRs = TheDef->getValueAsListOfDefs("SubRegs"); if (SRIs.size() != SRs.size()) PrintFatalError(TheDef->getLoc(), @@ -182,7 +184,7 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) { // Add ad hoc alias links. This is a symmetric relationship between two // registers, so build a symmetric graph by adding links in both ends. 
- std::vector Aliases = TheDef->getValueAsListOfDefs("Aliases"); + std::vector Aliases = TheDef->getValueAsListOfDefs("Aliases"); for (Record *Alias : Aliases) { CodeGenRegister *Reg = RegBank.getReg(Alias); ExplicitAliases.push_back(Reg); @@ -204,8 +206,8 @@ class RegUnitIterator { static CodeGenRegister::RegUnitList Sentinel; public: - RegUnitIterator(const CodeGenRegister::Vec &Regs): - RegI(Regs.begin()), RegE(Regs.end()) { + RegUnitIterator(const CodeGenRegister::Vec &Regs) + : RegI(Regs.begin()), RegE(Regs.end()) { if (RegI == RegE) { UnitI = Sentinel.end(); @@ -219,9 +221,15 @@ public: bool isValid() const { return UnitI != UnitE; } - unsigned operator* () const { assert(isValid()); return *UnitI; } + unsigned operator*() const { + assert(isValid()); + return *UnitI; + } - const CodeGenRegister *getReg() const { assert(isValid()); return *RegI; } + const CodeGenRegister *getReg() const { + assert(isValid()); + return *RegI; + } /// Preincrement. Move to the next unit. void operator++() { @@ -280,14 +288,15 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { Idx->Artificial = false; if (!SubRegs.insert(std::make_pair(Idx, SR)).second) PrintFatalError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() + - " appears twice in Register " + getName()); + " appears twice in Register " + + getName()); // Map explicit sub-registers first, so the names take precedence. // The inherited sub-registers are mapped below. SubReg2Idx.insert(std::make_pair(SR, Idx)); } // Keep track of inherited subregs and how they can be reached. - SmallPtrSet Orphans; + SmallPtrSet Orphans; // Clone inherited subregs and place duplicate entries in Orphans. // Here the order is important - earlier subregs take precedence. @@ -305,7 +314,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { // If dsub_2 has ComposedOf = [qsub_1, dsub_0], and this register has a // qsub_1 subreg, add a dsub_2 subreg. Keep growing Indices and process // expanded subreg indices recursively. - SmallVector Indices = ExplicitSubRegIndices; + SmallVector Indices = ExplicitSubRegIndices; for (unsigned i = 0; i != Indices.size(); ++i) { CodeGenSubRegIndex *Idx = Indices[i]; const CodeGenSubRegIndex::CompMap &Comps = Idx->getComposites(); @@ -350,7 +359,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { const SubRegMap &Map = SR->computeSubRegs(RegBank); for (const auto &SubReg : Map) if (Orphans.erase(SubReg.second)) - SubRegs[RegBank.getCompositeSubRegIndex(Idx, SubReg.first)] = SubReg.second; + SubRegs[RegBank.getCompositeSubRegIndex(Idx, SubReg.first)] = + SubReg.second; } // Compute the inverse SubReg -> Idx map. @@ -360,7 +370,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { if (TheDef) Loc = TheDef->getLoc(); PrintFatalError(Loc, "Register " + getName() + - " has itself as a sub-register"); + " has itself as a sub-register"); } // Compute AllSuperRegsCovered. @@ -368,17 +378,18 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { SubReg.first->AllSuperRegsCovered = false; // Ensure that every sub-register has a unique name. - DenseMap::iterator Ins = - SubReg2Idx.insert(std::make_pair(SubReg.second, SubReg.first)).first; + DenseMap::iterator Ins = + SubReg2Idx.insert(std::make_pair(SubReg.second, SubReg.first)).first; if (Ins->second == SubReg.first) continue; // Trouble: Two different names for SubReg.second. 
ArrayRef Loc; if (TheDef) Loc = TheDef->getLoc(); - PrintFatalError(Loc, "Sub-register can't have two names: " + - SubReg.second->getName() + " available as " + - SubReg.first->getName() + " and " + Ins->second->getName()); + PrintFatalError( + Loc, "Sub-register can't have two names: " + SubReg.second->getName() + + " available as " + SubReg.first->getName() + " and " + + Ins->second->getName()); } // Derive possible names for sub-register concatenations from any explicit @@ -392,7 +403,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { continue; // SR is composed of multiple sub-regs. Find their names in this register. - SmallVector Parts; + SmallVector Parts; for (unsigned j = 0, e = SR->ExplicitSubRegs.size(); j != e; ++j) { CodeGenSubRegIndex &I = *SR->ExplicitSubRegIndices[j]; if (!I.Artificial) @@ -464,8 +475,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { SmallVector NewSubRegs; - std::queue> SubRegQueue; - for (std::pair P : SubRegs) + std::queue> SubRegQueue; + for (std::pair P : SubRegs) SubRegQueue.push(P); // Look at the leading super-registers of each sub-register. Those are the @@ -479,7 +490,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { const CodeGenRegister::SuperRegList &Leads = SubReg->LeadingSuperRegs; for (unsigned i = 0, e = Leads.size(); i != e; ++i) { - CodeGenRegister *Cand = const_cast(Leads[i]); + CodeGenRegister *Cand = const_cast(Leads[i]); // Already got this sub-register? if (Cand == this || getSubRegIndex(Cand)) continue; @@ -488,7 +499,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { "Super-register has no sub-registers"); if (Cand->ExplicitSubRegs.size() == 1) continue; - SmallVector Parts; + SmallVector Parts; // We know that the first component is (SubRegIdx,SubReg). However we // may still need to split it into smaller subregister parts. assert(Cand->ExplicitSubRegs[0] == SubReg && "LeadingSuperRegs correct"); @@ -513,7 +524,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { // Each part of Cand is a sub-register of this. Make the full Cand also // a sub-register with a concatenated sub-register index. 
CodeGenSubRegIndex *Concat = RegBank.getConcatSubRegIndex(Parts); - std::pair NewSubReg = + std::pair NewSubReg = std::make_pair(Concat, Cand); if (!SubRegs.insert(NewSubReg).second) @@ -570,9 +581,8 @@ void CodeGenRegister::computeSuperRegs(CodeGenRegBank &RegBank) { TopoSig = RegBank.getTopoSig(Id); } -void -CodeGenRegister::addSubRegsPreOrder(SetVector &OSet, - CodeGenRegBank &RegBank) const { +void CodeGenRegister::addSubRegsPreOrder( + SetVector &OSet, CodeGenRegBank &RegBank) const { assert(SubRegsComplete && "Must precompute sub-registers"); for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) { CodeGenRegister *SR = ExplicitSubRegs[i]; @@ -611,7 +621,7 @@ struct TupleExpander : SetTheory::Expander { : SynthDefs(SynthDefs) {} void expand(SetTheory &ST, Record *Def, SetTheory::RecSet &Elts) override { - std::vector Indices = Def->getValueAsListOfDefs("SubRegIndices"); + std::vector Indices = Def->getValueAsListOfDefs("SubRegIndices"); unsigned Dim = Indices.size(); ListInit *SubRegs = Def->getValueAsListInit("SubRegs"); if (Dim != SubRegs->size()) @@ -635,17 +645,18 @@ struct TupleExpander : SetTheory::Expander { Record *RegisterCl = Def->getRecords().getClass("Register"); RecTy *RegisterRecTy = RecordRecTy::get(RegisterCl); std::vector RegNames = - Def->getValueAsListOfStrings("RegAsmNames"); + Def->getValueAsListOfStrings("RegAsmNames"); // Zip them up. RecordKeeper &RK = Def->getRecords(); for (unsigned n = 0; n != Length; ++n) { std::string Name; Record *Proto = Lists[0][n]; - std::vector Tuple; + std::vector Tuple; for (unsigned i = 0; i != Dim; ++i) { Record *Reg = Lists[i][n]; - if (i) Name += '_'; + if (i) + Name += '_'; Name += Reg->getName(); Tuple.push_back(DefInit::get(Reg)); } @@ -660,7 +671,7 @@ struct TupleExpander : SetTheory::Expander { if (RegNames.size() <= n) PrintFatalError(Def->getLoc(), "Register tuple definition missing name for '" + - Name + "'."); + Name + "'."); AsmName = StringInit::get(RK, RegNames[n]); } @@ -703,15 +714,13 @@ struct TupleExpander : SetTheory::Expander { RV.setValue(BitInit::get(RK, true)); // Copy fields from the RegisterTuples def. - if (Field == "SubRegIndices" || - Field == "CompositeIndices") { + if (Field == "SubRegIndices" || Field == "CompositeIndices") { NewReg->addValue(*Def->getValue(Field)); continue; } // Some fields get their default uninitialized value. 
- if (Field == "DwarfNumbers" || - Field == "DwarfAlias" || + if (Field == "DwarfNumbers" || Field == "DwarfAlias" || Field == "Aliases") { if (const RecordVal *DefRV = RegisterCl->getValue(Field)) NewReg->addValue(*DefRV); @@ -740,7 +749,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) : TheDef(R), Name(std::string(R->getName())), TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), TSFlags(0) { GeneratePressureSet = R->getValueAsBit("GeneratePressureSet"); - std::vector TypeList = R->getValueAsListOfDefs("RegTypes"); + std::vector TypeList = R->getValueAsListOfDefs("RegTypes"); if (TypeList.empty()) PrintFatalError(R->getLoc(), "RegTypes list must not be empty!"); for (unsigned i = 0, e = TypeList.size(); i != e; ++i) { @@ -779,7 +788,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) Order.pop_back(); if (!contains(Reg)) PrintFatalError(R->getLoc(), " AltOrder register " + Reg->getName() + - " is not a class member"); + " is not a class member"); } } @@ -793,8 +802,8 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) "Impossible to determine register size"); if (!RSI.hasDefault()) { RegSizeInfo RI; - RI.RegSize = RI.SpillSize = Size ? Size - : VTs[0].getSimple().getSizeInBits(); + RI.RegSize = RI.SpillSize = + Size ? Size : VTs[0].getSimple().getSizeInBits(); RI.SpillAlignment = R->getValueAsInt("Alignment"); RSI.insertRegSizeForMode(DefaultMode, RI); } @@ -890,7 +899,7 @@ bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const { deref>()); } -unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank& RegBank) const { +unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank &RegBank) const { if (TheDef && !TheDef->isValueUnset("Weight")) return TheDef->getValueAsInt("Weight"); @@ -902,19 +911,19 @@ unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank& RegBank) const { namespace llvm { - raw_ostream &operator<<(raw_ostream &OS, const CodeGenRegisterClass::Key &K) { - OS << "{ " << K.RSI; - for (const auto R : *K.Members) - OS << ", " << R->getName(); - return OS << " }"; - } +raw_ostream &operator<<(raw_ostream &OS, const CodeGenRegisterClass::Key &K) { + OS << "{ " << K.RSI; + for (const auto R : *K.Members) + OS << ", " << R->getName(); + return OS << " }"; +} } // end namespace llvm // This is a simple lexicographical order that can be used to search for sets. // It is not the same as the topological order provided by TopoOrderRC. -bool CodeGenRegisterClass::Key:: -operator<(const CodeGenRegisterClass::Key &B) const { +bool CodeGenRegisterClass::Key::operator<( + const CodeGenRegisterClass::Key &B) const { assert(Members && B.Members); return std::tie(*Members, RSI) < std::tie(*B.Members, B.RSI); } @@ -1066,7 +1075,7 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs( // Find all the subreg classes and order them by size too. std::vector> SuperRegClasses; - for (auto &RC: RegClasses) { + for (auto &RC : RegClasses) { BitVector SuperRegClassesBV(RegClasses.size()); RC.getSuperRegClasses(SubIdx, SuperRegClassesBV); if (SuperRegClassesBV.any()) @@ -1129,8 +1138,8 @@ void CodeGenRegisterClass::getSuperRegClasses(const CodeGenSubRegIndex *SubIdx, } // Populate a unique sorted list of units from a register set. 
-void CodeGenRegisterClass::buildRegUnitSet(const CodeGenRegBank &RegBank, - std::vector &RegUnits) const { +void CodeGenRegisterClass::buildRegUnitSet( + const CodeGenRegBank &RegBank, std::vector &RegUnits) const { std::vector TmpUnits; for (RegUnitIterator UnitI(Members); UnitI.isValid(); ++UnitI) { const RegUnit &RU = RegBank.getRegUnit(*UnitI); @@ -1158,7 +1167,8 @@ CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank, //===----------------------------------------------------------------------===// CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, - const CodeGenHwModes &Modes) : CGH(Modes) { + const CodeGenHwModes &Modes) + : CGH(Modes) { // Configure register Sets to understand register classes and tuples. Sets.addFieldExpander("RegisterClass", "MemberList"); Sets.addFieldExpander("CalleeSavedRegs", "SaveList"); @@ -1167,7 +1177,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, // Read in the user-defined (named) sub-register indices. // More indices will be synthesized later. - std::vector SRIs = Records.getAllDerivedDefinitions("SubRegIndex"); + std::vector SRIs = Records.getAllDerivedDefinitions("SubRegIndex"); llvm::sort(SRIs, LessRecord()); for (unsigned i = 0, e = SRIs.size(); i != e; ++i) getSubRegIdx(SRIs[i]); @@ -1238,8 +1248,9 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, SRI.computeConcatTransitiveClosure(); if (!SRI.ConcatenationOf.empty()) ConcatIdx.insert(std::make_pair( - SmallVector(SRI.ConcatenationOf.begin(), - SRI.ConcatenationOf.end()), &SRI)); + SmallVector(SRI.ConcatenationOf.begin(), + SRI.ConcatenationOf.end()), + &SRI)); } // Infer even more sub-registers by combining leading super-registers. @@ -1269,7 +1280,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, NumNativeRegUnits = RegUnits.size(); // Read in register class definitions. - std::vector RCs = Records.getAllDerivedDefinitions("RegisterClass"); + std::vector RCs = Records.getAllDerivedDefinitions("RegisterClass"); if (RCs.empty()) PrintFatalError("No 'RegisterClass' subclasses defined!"); @@ -1299,8 +1310,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, } // Create a synthetic CodeGenSubRegIndex without a corresponding Record. -CodeGenSubRegIndex* -CodeGenRegBank::createSubRegIndex(StringRef Name, StringRef Namespace) { +CodeGenSubRegIndex *CodeGenRegBank::createSubRegIndex(StringRef Name, + StringRef Namespace) { SubRegIndices.emplace_back(Name, Namespace, SubRegIndices.size() + 1); return &SubRegIndices.back(); } @@ -1315,7 +1326,7 @@ CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(Record *Def) { } const CodeGenSubRegIndex * -CodeGenRegBank::findSubRegIdx(const Record* Def) const { +CodeGenRegBank::findSubRegIdx(const Record *Def) const { return Def2SubRegIdx.lookup(Def); } @@ -1339,7 +1350,7 @@ void CodeGenRegBank::addToMaps(CodeGenRegisterClass *RC) { } // Create a synthetic sub-class if it is missing. -CodeGenRegisterClass* +CodeGenRegisterClass * CodeGenRegBank::getOrCreateSubClass(const CodeGenRegisterClass *RC, const CodeGenRegister::Vec *Members, StringRef Name) { @@ -1362,7 +1373,7 @@ CodeGenRegisterClass *CodeGenRegBank::getRegClass(const Record *Def) const { PrintFatalError(Def->getLoc(), "Not a known RegisterClass!"); } -CodeGenSubRegIndex* +CodeGenSubRegIndex * CodeGenRegBank::getCompositeSubRegIndex(CodeGenSubRegIndex *A, CodeGenSubRegIndex *B) { // Look for an existing entry. 
@@ -1377,8 +1388,8 @@ CodeGenRegBank::getCompositeSubRegIndex(CodeGenSubRegIndex *A, return Comp; } -CodeGenSubRegIndex *CodeGenRegBank:: -getConcatSubRegIndex(const SmallVector &Parts) { +CodeGenSubRegIndex *CodeGenRegBank::getConcatSubRegIndex( + const SmallVector &Parts) { assert(Parts.size() > 1 && "Need two parts to concatenate"); #ifndef NDEBUG for (CodeGenSubRegIndex *Idx : Parts) { @@ -1419,26 +1430,26 @@ getConcatSubRegIndex(const SmallVector &Parts) { } void CodeGenRegBank::computeComposites() { - using RegMap = std::map; + using RegMap = std::map; // Subreg -> { Reg->Reg }, where the right-hand side is the mapping from // register to (sub)register associated with the action of the left-hand // side subregister. - std::map SubRegAction; + std::map SubRegAction; for (const CodeGenRegister &R : Registers) { const CodeGenRegister::SubRegMap &SM = R.getSubRegs(); - for (std::pair P : SM) + for (std::pair P : SM) SubRegAction[P.first].insert({&R, P.second}); } // Calculate the composition of two subregisters as compositions of their // associated actions. - auto compose = [&SubRegAction] (const CodeGenSubRegIndex *Sub1, - const CodeGenSubRegIndex *Sub2) { + auto compose = [&SubRegAction](const CodeGenSubRegIndex *Sub1, + const CodeGenSubRegIndex *Sub2) { RegMap C; const RegMap &Img1 = SubRegAction.at(Sub1); const RegMap &Img2 = SubRegAction.at(Sub2); - for (std::pair P : Img1) { + for (std::pair P : Img1) { auto F = Img2.find(P.second); if (F != Img2.end()) C.insert({P.first, F->second}); @@ -1447,13 +1458,13 @@ void CodeGenRegBank::computeComposites() { }; // Check if the two maps agree on the intersection of their domains. - auto agree = [] (const RegMap &Map1, const RegMap &Map2) { + auto agree = [](const RegMap &Map1, const RegMap &Map2) { // Technically speaking, an empty map agrees with any other map, but // this could flag false positives. We're interested in non-vacuous // agreements. if (Map1.empty() || Map2.empty()) return false; - for (std::pair P : Map1) { + for (std::pair P : Map1) { auto F = Map2.find(P.first); if (F == Map2.end() || P.second != F->second) return false; @@ -1461,9 +1472,9 @@ void CodeGenRegBank::computeComposites() { return true; }; - using CompositePair = std::pair; - SmallSet UserDefined; + using CompositePair = + std::pair; + SmallSet UserDefined; for (const CodeGenSubRegIndex &Idx : SubRegIndices) for (auto P : Idx.getComposites()) UserDefined.insert(std::make_pair(&Idx, P.first)); @@ -1528,8 +1539,8 @@ void CodeGenRegBank::computeSubRegLaneMasks() { if (Idx.getComposites().empty()) { if (Bit > LaneBitmask::BitWidth) { PrintFatalError( - Twine("Ran out of lanemask bits to represent subregister ") - + Idx.getName()); + Twine("Ran out of lanemask bits to represent subregister ") + + Idx.getName()); } Idx.LaneMask = LaneBitmask::getLane(Bit); ++Bit; @@ -1556,7 +1567,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { unsigned DstBit = Idx.LaneMask.getHighestLane(); assert(Idx.LaneMask == LaneBitmask::getLane(DstBit) && "Must be a leaf subregister"); - MaskRolPair MaskRol = { LaneBitmask::getLane(0), (uint8_t)DstBit }; + MaskRolPair MaskRol = {LaneBitmask::getLane(0), (uint8_t)DstBit}; LaneTransforms.push_back(MaskRol); } else { // Go through all leaf subregisters and find the ones that compose with @@ -1571,7 +1582,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // Replicate the behaviour from the lane mask generation loop above. 
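The compose and agree lambdas reformatted above treat a sub-register index as a partial map from registers to registers; composing two indices is ordinary composition of those maps, defined only where the second map covers the image of the first. A minimal sketch with ints standing in for registers (hypothetical names, not the real CodeGen types):

#include <map>

using RegMap = std::map<int, int>; // register -> (sub)register action

// Compose Img2 after Img1, keeping only points where both are defined.
RegMap composeActions(const RegMap &Img1, const RegMap &Img2) {
  RegMap C;
  for (const auto &P : Img1) {
    auto F = Img2.find(P.second);
    if (F != Img2.end())
      C.insert({P.first, F->second});
  }
  return C;
}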
unsigned SrcBit = NextBit; LaneBitmask SrcMask = LaneBitmask::getLane(SrcBit); - if (NextBit < LaneBitmask::BitWidth-1) + if (NextBit < LaneBitmask::BitWidth - 1) ++NextBit; assert(Idx2.LaneMask == SrcMask); @@ -1586,8 +1597,8 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // Create Mask+Rotate operation and merge with existing ops if possible. unsigned DstBit = Composite->LaneMask.getHighestLane(); int Shift = DstBit - SrcBit; - uint8_t RotateLeft = Shift >= 0 ? (uint8_t)Shift - : LaneBitmask::BitWidth + Shift; + uint8_t RotateLeft = + Shift >= 0 ? (uint8_t)Shift : LaneBitmask::BitWidth + Shift; for (auto &I : LaneTransforms) { if (I.RotateLeft == RotateLeft) { I.Mask |= SrcMask; @@ -1595,7 +1606,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { } } if (SrcMask.any()) { - MaskRolPair MaskRol = { SrcMask, RotateLeft }; + MaskRolPair MaskRol = {SrcMask, RotateLeft}; LaneTransforms.push_back(MaskRol); } } @@ -1611,7 +1622,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // in a sequence with 0 entries we can just pick any other. Choose // Mask 0xffffffff with Rotation 0. if (LaneTransforms.size() == 0) { - MaskRolPair P = { LaneBitmask::getAll(), 0 }; + MaskRolPair P = {LaneBitmask::getAll(), 0}; LaneTransforms.push_back(P); } } @@ -1679,7 +1690,7 @@ struct UberRegSet { // // UberRegSets[0] is a special non-allocatable set. static void computeUberSets(std::vector &UberSets, - std::vector &RegSets, + std::vector &RegSets, CodeGenRegBank &RegBank) { const auto &Registers = RegBank.getRegisters(); @@ -1742,7 +1753,8 @@ static void computeUberWeights(std::vector &UberSets, CodeGenRegBank &RegBank) { // Skip the first unallocatable set. for (std::vector::iterator I = std::next(UberSets.begin()), - E = UberSets.end(); I != E; ++I) { + E = UberSets.end(); + I != E; ++I) { // Initialize all unit weights in this set, and remember the max units/reg. const CodeGenRegister *Reg = nullptr; @@ -1797,7 +1809,7 @@ static void computeUberWeights(std::vector &UberSets, // - induces recomputation of UberWeights. static bool normalizeWeight(CodeGenRegister *Reg, std::vector &UberSets, - std::vector &RegSets, + std::vector &RegSets, BitVector &NormalRegs, CodeGenRegister::RegUnitList &NormalUnits, CodeGenRegBank &RegBank) { @@ -1830,15 +1842,14 @@ static bool normalizeWeight(CodeGenRegister *Reg, // for this register, has not been used to normalize a subregister's set, // and has not already been used to singularly determine this UberRegSet. unsigned AdjustUnit = *Reg->getRegUnits().begin(); - if (Reg->getRegUnits().count() != 1 - || hasRegUnit(NormalUnits, AdjustUnit) - || hasRegUnit(UberSet->SingularDeterminants, AdjustUnit)) { + if (Reg->getRegUnits().count() != 1 || + hasRegUnit(NormalUnits, AdjustUnit) || + hasRegUnit(UberSet->SingularDeterminants, AdjustUnit)) { // We don't have an adjustable unit, so adopt a new one. AdjustUnit = RegBank.newRegUnit(UberSet->Weight - RegWeight); Reg->adoptRegUnit(AdjustUnit); // Adopting a unit does not immediately require recomputing set weights. - } - else { + } else { // Adjust the existing single unit. if (!RegBank.getRegUnit(AdjustUnit).Artificial) RegBank.increaseRegUnitWeight(AdjustUnit, UberSet->Weight - RegWeight); @@ -1860,7 +1871,7 @@ static bool normalizeWeight(CodeGenRegister *Reg, // where each register's weight is defined as sum of its units' weights. 
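In the loop above, a negative Shift is folded into an equivalent left rotation: rotating left by BitWidth + Shift is the same as rotating right by -Shift. A standalone sketch of applying one such mask-and-rotate step to a 32-bit lane mask, mirroring the MaskRolPair semantics documented in the header (plain uint32_t instead of LaneBitmask; hypothetical helper name):

#include <cstdint>

// Keep the bits selected by Mask, then rotate them left by Rol with
// wraparound at 32 bits.
uint32_t applyMaskRol(uint32_t LaneMask, uint32_t Mask, uint8_t Rol) {
  uint32_t Kept = LaneMask & Mask;
  Rol &= 31; // keep the rotate amount well-defined
  if (Rol == 0)
    return Kept;
  return (Kept << Rol) | (Kept >> (32 - Rol));
}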
void CodeGenRegBank::computeRegUnitWeights() { std::vector UberSets; - std::vector RegSets(Registers.size()); + std::vector RegSets(Registers.size()); computeUberSets(UberSets, RegSets, *this); // UberSets and RegSets are now immutable. @@ -1871,7 +1882,7 @@ void CodeGenRegBank::computeRegUnitWeights() { unsigned NumIters = 0; for (bool Changed = true; Changed; ++NumIters) { assert(NumIters <= NumNativeRegUnits && "Runaway register unit weights"); - (void) NumIters; + (void)NumIters; Changed = false; for (auto &Reg : Registers) { CodeGenRegister::RegUnitList NormalUnits; @@ -1887,9 +1898,9 @@ void CodeGenRegBank::computeRegUnitWeights() { static std::vector::const_iterator findRegUnitSet(const std::vector &UniqueSets, const RegUnitSet &Set) { - std::vector::const_iterator - I = UniqueSets.begin(), E = UniqueSets.end(); - for(;I != E; ++I) { + std::vector::const_iterator I = UniqueSets.begin(), + E = UniqueSets.end(); + for (; I != E; ++I) { if (I->Units == Set.Units) break; } @@ -1899,8 +1910,8 @@ findRegUnitSet(const std::vector &UniqueSets, // Return true if the RUSubSet is a subset of RUSuperSet. static bool isRegUnitSubSet(const std::vector &RUSubSet, const std::vector &RUSuperSet) { - return std::includes(RUSuperSet.begin(), RUSuperSet.end(), - RUSubSet.begin(), RUSubSet.end()); + return std::includes(RUSuperSet.begin(), RUSuperSet.end(), RUSubSet.begin(), + RUSubSet.end()); } /// Iteratively prune unit sets. Prune subsets that are close to the superset, @@ -1925,8 +1936,8 @@ void CodeGenRegBank::pruneUnitSets() { // Form an equivalence class of UnitSets with no significant difference. std::vector SuperSetIDs; - for (unsigned SubIdx = 0, EndIdx = RegUnitSets.size(); - SubIdx != EndIdx; ++SubIdx) { + for (unsigned SubIdx = 0, EndIdx = RegUnitSets.size(); SubIdx != EndIdx; + ++SubIdx) { const RegUnitSet &SubSet = RegUnitSets[SubIdx]; unsigned SuperIdx = 0; for (; SuperIdx != EndIdx; ++SuperIdx) { @@ -1935,10 +1946,10 @@ void CodeGenRegBank::pruneUnitSets() { unsigned UnitWeight = RegUnits[SubSet.Units[0]].Weight; const RegUnitSet &SuperSet = RegUnitSets[SuperIdx]; - if (isRegUnitSubSet(SubSet.Units, SuperSet.Units) - && (SubSet.Units.size() + 3 > SuperSet.Units.size()) - && UnitWeight == RegUnits[SuperSet.Units[0]].Weight - && UnitWeight == RegUnits[SuperSet.Units.back()].Weight) { + if (isRegUnitSubSet(SubSet.Units, SuperSet.Units) && + (SubSet.Units.size() + 3 > SuperSet.Units.size()) && + UnitWeight == RegUnits[SuperSet.Units[0]].Weight && + UnitWeight == RegUnits[SuperSet.Units.back()].Weight) { LLVM_DEBUG(dbgs() << "UnitSet " << SubIdx << " subsumed by " << SuperIdx << "\n"); // We can pick any of the set names for the merged set. Go for the @@ -1988,7 +1999,7 @@ void CodeGenRegBank::computeRegUnitSets() { // Find an existing RegUnitSet. std::vector::const_iterator SetI = - findRegUnitSet(RegUnitSets, RegUnitSets.back()); + findRegUnitSet(RegUnitSets, RegUnitSets.back()); if (SetI != std::prev(RegUnitSets.end())) RegUnitSets.pop_back(); } @@ -2023,10 +2034,10 @@ void CodeGenRegBank::computeRegUnitSets() { // In theory, this is combinatorial. In practice, it needs to be bounded // by a small number of sets for regpressure to be efficient. // If the assert is hit, we need to implement pruning. - assert(Idx < (2*NumRegUnitSubSets) && "runaway unit set inference"); + assert(Idx < (2 * NumRegUnitSubSets) && "runaway unit set inference"); // Compare new sets with all original classes. - for (unsigned SearchIdx = (Idx >= NumRegUnitSubSets) ? 
0 : Idx+1; + for (unsigned SearchIdx = (Idx >= NumRegUnitSubSets) ? 0 : Idx + 1; SearchIdx != EndIdx; ++SearchIdx) { std::set Intersection; std::set_intersection(RegUnitSets[Idx].Units.begin(), @@ -2040,7 +2051,7 @@ void CodeGenRegBank::computeRegUnitSets() { // Speculatively grow the RegUnitSets to hold the new set. RegUnitSets.resize(RegUnitSets.size() + 1); RegUnitSets.back().Name = - RegUnitSets[Idx].Name + "_with_" + RegUnitSets[SearchIdx].Name; + RegUnitSets[Idx].Name + "_with_" + RegUnitSets[SearchIdx].Name; std::set_union(RegUnitSets[Idx].Units.begin(), RegUnitSets[Idx].Units.end(), @@ -2051,7 +2062,7 @@ void CodeGenRegBank::computeRegUnitSets() { // Find an existing RegUnitSet, or add the union to the unique sets. std::vector::const_iterator SetI = - findRegUnitSet(RegUnitSets, RegUnitSets.back()); + findRegUnitSet(RegUnitSets, RegUnitSets.back()); if (SetI != std::prev(RegUnitSets.end())) RegUnitSets.pop_back(); else { @@ -2098,8 +2109,8 @@ void CodeGenRegBank::computeRegUnitSets() { dbgs() << "\n UnitSetIDs:"); // Find all supersets. - for (unsigned USIdx = 0, USEnd = RegUnitSets.size(); - USIdx != USEnd; ++USIdx) { + for (unsigned USIdx = 0, USEnd = RegUnitSets.size(); USIdx != USEnd; + ++USIdx) { if (isRegUnitSubSet(RCRegUnits, RegUnitSets[USIdx].Units)) { LLVM_DEBUG(dbgs() << " " << USIdx); RegClassUnitSets[RCIdx].push_back(USIdx); @@ -2114,8 +2125,8 @@ void CodeGenRegBank::computeRegUnitSets() { // contain the unit. Normally, this matches an existing list of UnitSets for a // register class. If not, we create a new entry in RegClassUnitSets as a // "fake" register class. - for (unsigned UnitIdx = 0, UnitEnd = NumNativeRegUnits; - UnitIdx < UnitEnd; ++UnitIdx) { + for (unsigned UnitIdx = 0, UnitEnd = NumNativeRegUnits; UnitIdx < UnitEnd; + ++UnitIdx) { std::vector RUSets; for (unsigned i = 0, e = RegUnitSets.size(); i != e; ++i) { RegUnitSet &RUSet = RegUnitSets[i]; @@ -2124,8 +2135,8 @@ void CodeGenRegBank::computeRegUnitSets() { RUSets.push_back(i); } unsigned RCUnitSetsIdx = 0; - for (unsigned e = RegClassUnitSets.size(); - RCUnitSetsIdx != e; ++RCUnitSetsIdx) { + for (unsigned e = RegClassUnitSets.size(); RCUnitSetsIdx != e; + ++RCUnitSetsIdx) { if (RegClassUnitSets[RCUnitSetsIdx] == RUSets) { break; } @@ -2301,9 +2312,8 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) { continue; } // This is a real subset. See if we have a matching class. - CodeGenRegisterClass *SubRC = - getOrCreateSubClass(RC, &I->second, - RC->getName() + "_with_" + I->first->getName()); + CodeGenRegisterClass *SubRC = getOrCreateSubClass( + RC, &I->second, RC->getName() + "_with_" + I->first->getName()); RC->setSubClassWithSubReg(&SubIdx, SubRC); } } @@ -2315,8 +2325,9 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) { // has a maximal result for any SubIdx and any X >= FirstSubRegRC. // -void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC, - std::list::iterator FirstSubRegRC) { +void CodeGenRegBank::inferMatchingSuperRegClass( + CodeGenRegisterClass *RC, + std::list::iterator FirstSubRegRC) { DenseMap> SubToSuperRegs; BitVector TopoSigs(getNumTopoSigs()); @@ -2374,9 +2385,9 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC, // Only a subset of RC maps into SubRC. Make sure it is represented by a // class. 
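The sorted-range calls in these hunks (std::includes in isRegUnitSubSet, std::set_intersection and std::set_union in computeRegUnitSets) are only correct because every RegUnitSet keeps its Units vector sorted and duplicate-free; on unsorted input their results are meaningless. A small self-contained illustration with plain unsigned vectors:

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main() {
  std::vector<unsigned> Super = {1, 3, 5, 7, 9}; // sorted, unique
  std::vector<unsigned> Sub = {3, 7};            // sorted, unique
  assert(std::includes(Super.begin(), Super.end(), Sub.begin(), Sub.end()));

  std::vector<unsigned> Union;
  std::set_union(Super.begin(), Super.end(), Sub.begin(), Sub.end(),
                 std::back_inserter(Union));
  assert(Union == Super); // Sub contributes nothing new
  return 0;
}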
- getOrCreateSubClass(RC, &SubSetVec, RC->getName() + "_with_" + - SubIdx.getName() + "_in_" + - SubRC.getName()); + getOrCreateSubClass(RC, &SubSetVec, + RC->getName() + "_with_" + SubIdx.getName() + "_in_" + + SubRC.getName()); } } } @@ -2431,8 +2442,7 @@ void CodeGenRegBank::computeInferredRegisterClasses() { /// return null. If the register is in multiple classes, and the classes have a /// superset-subset relationship and the same set of types, return the /// superclass. Otherwise return null. -const CodeGenRegisterClass* -CodeGenRegBank::getRegClassForRegister(Record *R) { +const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) { const CodeGenRegister *Reg = getReg(R); const CodeGenRegisterClass *FoundRC = nullptr; for (const auto &RC : getRegClasses()) { @@ -2477,8 +2487,8 @@ CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord, const CodeGenRegister *Reg = getReg(RegRecord); const CodeGenRegisterClass *BestRC = nullptr; for (const auto &RC : getRegClasses()) { - if ((!VT || RC.hasType(*VT)) && - RC.contains(Reg) && (!BestRC || BestRC->hasSubClass(&RC))) + if ((!VT || RC.hasType(*VT)) && RC.contains(Reg) && + (!BestRC || BestRC->hasSubClass(&RC))) BestRC = &RC; } @@ -2486,8 +2496,8 @@ CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord, return BestRC; } -BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef Regs) { - SetVector Set; +BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef Regs) { + SetVector Set; // First add Regs with all sub-registers. for (unsigned i = 0, e = Regs.size(); i != e; ++i) { diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index 97f6081..cfc6d87 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -44,810 +44,798 @@ namespace llvm { - class CodeGenRegBank; +class CodeGenRegBank; - /// Used to encode a step in a register lane mask transformation. - /// Mask the bits specified in Mask, then rotate them Rol bits to the left - /// assuming a wraparound at 32bits. - struct MaskRolPair { - LaneBitmask Mask; - uint8_t RotateLeft; +/// Used to encode a step in a register lane mask transformation. +/// Mask the bits specified in Mask, then rotate them Rol bits to the left +/// assuming a wraparound at 32bits. +struct MaskRolPair { + LaneBitmask Mask; + uint8_t RotateLeft; - bool operator==(const MaskRolPair Other) const { - return Mask == Other.Mask && RotateLeft == Other.RotateLeft; - } - bool operator!=(const MaskRolPair Other) const { - return Mask != Other.Mask || RotateLeft != Other.RotateLeft; - } - }; - - /// CodeGenSubRegIndex - Represents a sub-register index. - class CodeGenSubRegIndex { - Record *const TheDef; - std::string Name; - std::string Namespace; - - public: - uint16_t Size; - uint16_t Offset; - const unsigned EnumValue; - mutable LaneBitmask LaneMask; - mutable SmallVector CompositionLaneMaskTransform; - - /// A list of subregister indexes concatenated resulting in this - /// subregister index. This is the reverse of CodeGenRegBank::ConcatIdx. - SmallVector ConcatenationOf; - - // Are all super-registers containing this SubRegIndex covered by their - // sub-registers? - bool AllSuperRegsCovered; - // A subregister index is "artificial" if every subregister obtained - // from applying this index is artificial. Artificial subregister - // indexes are not used to create new register classes. 
- bool Artificial; - - CodeGenSubRegIndex(Record *R, unsigned Enum); - CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum); - CodeGenSubRegIndex(CodeGenSubRegIndex&) = delete; - - const std::string &getName() const { return Name; } - const std::string &getNamespace() const { return Namespace; } - std::string getQualifiedName() const; - - // Map of composite subreg indices. - typedef std::map>> - CompMap; - - // Returns the subreg index that results from composing this with Idx. - // Returns NULL if this and Idx don't compose. - CodeGenSubRegIndex *compose(CodeGenSubRegIndex *Idx) const { - CompMap::const_iterator I = Composed.find(Idx); - return I == Composed.end() ? nullptr : I->second; - } + bool operator==(const MaskRolPair Other) const { + return Mask == Other.Mask && RotateLeft == Other.RotateLeft; + } + bool operator!=(const MaskRolPair Other) const { + return Mask != Other.Mask || RotateLeft != Other.RotateLeft; + } +}; + +/// CodeGenSubRegIndex - Represents a sub-register index. +class CodeGenSubRegIndex { + Record *const TheDef; + std::string Name; + std::string Namespace; + +public: + uint16_t Size; + uint16_t Offset; + const unsigned EnumValue; + mutable LaneBitmask LaneMask; + mutable SmallVector CompositionLaneMaskTransform; + + /// A list of subregister indexes concatenated resulting in this + /// subregister index. This is the reverse of CodeGenRegBank::ConcatIdx. + SmallVector ConcatenationOf; + + // Are all super-registers containing this SubRegIndex covered by their + // sub-registers? + bool AllSuperRegsCovered; + // A subregister index is "artificial" if every subregister obtained + // from applying this index is artificial. Artificial subregister + // indexes are not used to create new register classes. + bool Artificial; + + CodeGenSubRegIndex(Record *R, unsigned Enum); + CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum); + CodeGenSubRegIndex(CodeGenSubRegIndex &) = delete; + + const std::string &getName() const { return Name; } + const std::string &getNamespace() const { return Namespace; } + std::string getQualifiedName() const; + + // Map of composite subreg indices. + typedef std::map>> + CompMap; + + // Returns the subreg index that results from composing this with Idx. + // Returns NULL if this and Idx don't compose. + CodeGenSubRegIndex *compose(CodeGenSubRegIndex *Idx) const { + CompMap::const_iterator I = Composed.find(Idx); + return I == Composed.end() ? nullptr : I->second; + } - // Add a composite subreg index: this+A = B. - // Return a conflicting composite, or NULL - CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A, - CodeGenSubRegIndex *B) { - assert(A && B); - std::pair Ins = + // Add a composite subreg index: this+A = B. + // Return a conflicting composite, or NULL + CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A, + CodeGenSubRegIndex *B) { + assert(A && B); + std::pair Ins = Composed.insert(std::make_pair(A, B)); - // Synthetic subreg indices that aren't contiguous (for instance ARM - // register tuples) don't have a bit range, so it's OK to let - // B->Offset == -1. For the other cases, accumulate the offset and set - // the size here. Only do so if there is no offset yet though. - if ((Offset != (uint16_t)-1 && A->Offset != (uint16_t)-1) && - (B->Offset == (uint16_t)-1)) { - B->Offset = Offset + A->Offset; - B->Size = A->Size; - } - return (Ins.second || Ins.first->second == B) ? nullptr - : Ins.first->second; - } - - // Update the composite maps of components specified in 'ComposedOf'. 
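addComposite above records the fact this+A = B with a single map insert and reports a conflict only if (this, A) was already mapped to a different B; re-inserting the same composition is benign. That insert-then-check shape in isolation (ints standing in for sub-register indices, hypothetical names):

#include <map>

using CompMap = std::map<int, int>; // A -> B for a fixed 'this'

// Returns 0 on success or benign repeat, else the conflicting old B.
int addCompositeToy(CompMap &Composed, int A, int B) {
  auto Ins = Composed.insert({A, B});
  return (Ins.second || Ins.first->second == B) ? 0 : Ins.first->second;
}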
- void updateComponents(CodeGenRegBank&); - - // Return the map of composites. - const CompMap &getComposites() const { return Composed; } - - // Compute LaneMask from Composed. Return LaneMask. - LaneBitmask computeLaneMask() const; - - void setConcatenationOf(ArrayRef Parts); - - /// Replaces subregister indexes in the `ConcatenationOf` list with - /// list of subregisters they are composed of (if any). Do this recursively. - void computeConcatTransitiveClosure(); - - bool operator<(const CodeGenSubRegIndex &RHS) const { - return this->EnumValue < RHS.EnumValue; - } - - private: - CompMap Composed; - }; - - /// CodeGenRegister - Represents a register definition. - class CodeGenRegister { - public: - Record *TheDef; - unsigned EnumValue; - std::vector CostPerUse; - bool CoveredBySubRegs = true; - bool HasDisjunctSubRegs = false; - bool Artificial = true; - bool Constant = false; + // Synthetic subreg indices that aren't contiguous (for instance ARM + // register tuples) don't have a bit range, so it's OK to let + // B->Offset == -1. For the other cases, accumulate the offset and set + // the size here. Only do so if there is no offset yet though. + if ((Offset != (uint16_t)-1 && A->Offset != (uint16_t)-1) && + (B->Offset == (uint16_t)-1)) { + B->Offset = Offset + A->Offset; + B->Size = A->Size; + } + return (Ins.second || Ins.first->second == B) ? nullptr : Ins.first->second; + } - // Map SubRegIndex -> Register. - typedef std::map>> - SubRegMap; + // Update the composite maps of components specified in 'ComposedOf'. + void updateComponents(CodeGenRegBank &); - CodeGenRegister(Record *R, unsigned Enum); + // Return the map of composites. + const CompMap &getComposites() const { return Composed; } - StringRef getName() const; + // Compute LaneMask from Composed. Return LaneMask. + LaneBitmask computeLaneMask() const; - // Extract more information from TheDef. This is used to build an object - // graph after all CodeGenRegister objects have been created. - void buildObjectGraph(CodeGenRegBank&); + void setConcatenationOf(ArrayRef Parts); - // Lazily compute a map of all sub-registers. - // This includes unique entries for all sub-sub-registers. - const SubRegMap &computeSubRegs(CodeGenRegBank&); + /// Replaces subregister indexes in the `ConcatenationOf` list with + /// list of subregisters they are composed of (if any). Do this recursively. + void computeConcatTransitiveClosure(); - // Compute extra sub-registers by combining the existing sub-registers. - void computeSecondarySubRegs(CodeGenRegBank&); + bool operator<(const CodeGenSubRegIndex &RHS) const { + return this->EnumValue < RHS.EnumValue; + } - // Add this as a super-register to all sub-registers after the sub-register - // graph has been built. - void computeSuperRegs(CodeGenRegBank&); +private: + CompMap Composed; +}; - const SubRegMap &getSubRegs() const { - assert(SubRegsComplete && "Must precompute sub-registers"); - return SubRegs; - } +/// CodeGenRegister - Represents a register definition. +class CodeGenRegister { +public: + Record *TheDef; + unsigned EnumValue; + std::vector CostPerUse; + bool CoveredBySubRegs = true; + bool HasDisjunctSubRegs = false; + bool Artificial = true; + bool Constant = false; - // Add sub-registers to OSet following a pre-order defined by the .td file. - void addSubRegsPreOrder(SetVector &OSet, - CodeGenRegBank&) const; + // Map SubRegIndex -> Register. + typedef std::map>> + SubRegMap; - // Return the sub-register index naming Reg as a sub-register of this - // register. 
Returns NULL if Reg is not a sub-register. - CodeGenSubRegIndex *getSubRegIndex(const CodeGenRegister *Reg) const { - return SubReg2Idx.lookup(Reg); - } + CodeGenRegister(Record *R, unsigned Enum); - typedef std::vector SuperRegList; + StringRef getName() const; - // Get the list of super-registers in topological order, small to large. - // This is valid after computeSubRegs visits all registers during RegBank - // construction. - const SuperRegList &getSuperRegs() const { - assert(SubRegsComplete && "Must precompute sub-registers"); - return SuperRegs; - } + // Extract more information from TheDef. This is used to build an object + // graph after all CodeGenRegister objects have been created. + void buildObjectGraph(CodeGenRegBank &); - // Get the list of ad hoc aliases. The graph is symmetric, so the list - // contains all registers in 'Aliases', and all registers that mention this - // register in 'Aliases'. - ArrayRef getExplicitAliases() const { - return ExplicitAliases; - } + // Lazily compute a map of all sub-registers. + // This includes unique entries for all sub-sub-registers. + const SubRegMap &computeSubRegs(CodeGenRegBank &); - // Get the topological signature of this register. This is a small integer - // less than RegBank.getNumTopoSigs(). Registers with the same TopoSig have - // identical sub-register structure. That is, they support the same set of - // sub-register indices mapping to the same kind of sub-registers - // (TopoSig-wise). - unsigned getTopoSig() const { - assert(SuperRegsComplete && "TopoSigs haven't been computed yet."); - return TopoSig; - } + // Compute extra sub-registers by combining the existing sub-registers. + void computeSecondarySubRegs(CodeGenRegBank &); - // List of register units in ascending order. - typedef SparseBitVector<> RegUnitList; - typedef SmallVector RegUnitLaneMaskList; + // Add this as a super-register to all sub-registers after the sub-register + // graph has been built. + void computeSuperRegs(CodeGenRegBank &); - // How many entries in RegUnitList are native? - RegUnitList NativeRegUnits; + const SubRegMap &getSubRegs() const { + assert(SubRegsComplete && "Must precompute sub-registers"); + return SubRegs; + } - // Get the list of register units. - // This is only valid after computeSubRegs() completes. - const RegUnitList &getRegUnits() const { return RegUnits; } + // Add sub-registers to OSet following a pre-order defined by the .td file. + void addSubRegsPreOrder(SetVector &OSet, + CodeGenRegBank &) const; - ArrayRef getRegUnitLaneMasks() const { - return ArrayRef(RegUnitLaneMasks).slice(0, NativeRegUnits.count()); - } + // Return the sub-register index naming Reg as a sub-register of this + // register. Returns NULL if Reg is not a sub-register. + CodeGenSubRegIndex *getSubRegIndex(const CodeGenRegister *Reg) const { + return SubReg2Idx.lookup(Reg); + } - // Get the native register units. This is a prefix of getRegUnits(). - RegUnitList getNativeRegUnits() const { - return NativeRegUnits; - } + typedef std::vector SuperRegList; - void setRegUnitLaneMasks(const RegUnitLaneMaskList &LaneMasks) { - RegUnitLaneMasks = LaneMasks; - } + // Get the list of super-registers in topological order, small to large. + // This is valid after computeSubRegs visits all registers during RegBank + // construction. + const SuperRegList &getSuperRegs() const { + assert(SubRegsComplete && "Must precompute sub-registers"); + return SuperRegs; + } - // Inherit register units from subregisters. - // Return true if the RegUnits changed. 
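getTopoSig above promises a small dense integer per unique sub-register shape; CodeGenRegBank::getTopoSig (later in this header) allocates those IDs with a single map insert whose tentative value is the map's current size, so a repeated signature gets back its existing number. The idiom in isolation (std::map over a byte vector standing in for TopoSigId):

#include <cstdint>
#include <map>
#include <vector>

using SigId = std::vector<uint8_t>; // stand-in for TopoSigId

// Dense ID per unique signature: insert() returns the existing entry
// when Id is already present, so the size-based ID only sticks for
// first-time signatures.
unsigned getOrAssignTopoSig(std::map<SigId, unsigned> &Sigs, const SigId &Id) {
  return Sigs.insert({Id, (unsigned)Sigs.size()}).first->second;
}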
- bool inheritRegUnits(CodeGenRegBank &RegBank); + // Get the list of ad hoc aliases. The graph is symmetric, so the list + // contains all registers in 'Aliases', and all registers that mention this + // register in 'Aliases'. + ArrayRef getExplicitAliases() const { + return ExplicitAliases; + } - // Adopt a register unit for pressure tracking. - // A unit is adopted iff its unit number is >= NativeRegUnits.count(). - void adoptRegUnit(unsigned RUID) { RegUnits.set(RUID); } + // Get the topological signature of this register. This is a small integer + // less than RegBank.getNumTopoSigs(). Registers with the same TopoSig have + // identical sub-register structure. That is, they support the same set of + // sub-register indices mapping to the same kind of sub-registers + // (TopoSig-wise). + unsigned getTopoSig() const { + assert(SuperRegsComplete && "TopoSigs haven't been computed yet."); + return TopoSig; + } - // Get the sum of this register's register unit weights. - unsigned getWeight(const CodeGenRegBank &RegBank) const; + // List of register units in ascending order. + typedef SparseBitVector<> RegUnitList; + typedef SmallVector RegUnitLaneMaskList; - // Canonically ordered set. - typedef std::vector Vec; + // How many entries in RegUnitList are native? + RegUnitList NativeRegUnits; - private: - bool SubRegsComplete; - bool SuperRegsComplete; - unsigned TopoSig; + // Get the list of register units. + // This is only valid after computeSubRegs() completes. + const RegUnitList &getRegUnits() const { return RegUnits; } - // The sub-registers explicit in the .td file form a tree. - SmallVector ExplicitSubRegIndices; - SmallVector ExplicitSubRegs; + ArrayRef getRegUnitLaneMasks() const { + return ArrayRef(RegUnitLaneMasks).slice(0, NativeRegUnits.count()); + } - // Explicit ad hoc aliases, symmetrized to form an undirected graph. - SmallVector ExplicitAliases; + // Get the native register units. This is a prefix of getRegUnits(). + RegUnitList getNativeRegUnits() const { return NativeRegUnits; } - // Super-registers where this is the first explicit sub-register. - SuperRegList LeadingSuperRegs; + void setRegUnitLaneMasks(const RegUnitLaneMaskList &LaneMasks) { + RegUnitLaneMasks = LaneMasks; + } - SubRegMap SubRegs; - SuperRegList SuperRegs; - DenseMap SubReg2Idx; - RegUnitList RegUnits; - RegUnitLaneMaskList RegUnitLaneMasks; - }; + // Inherit register units from subregisters. + // Return true if the RegUnits changed. + bool inheritRegUnits(CodeGenRegBank &RegBank); + + // Adopt a register unit for pressure tracking. + // A unit is adopted iff its unit number is >= NativeRegUnits.count(). + void adoptRegUnit(unsigned RUID) { RegUnits.set(RUID); } + + // Get the sum of this register's register unit weights. + unsigned getWeight(const CodeGenRegBank &RegBank) const; + + // Canonically ordered set. + typedef std::vector Vec; + +private: + bool SubRegsComplete; + bool SuperRegsComplete; + unsigned TopoSig; + + // The sub-registers explicit in the .td file form a tree. + SmallVector ExplicitSubRegIndices; + SmallVector ExplicitSubRegs; + + // Explicit ad hoc aliases, symmetrized to form an undirected graph. + SmallVector ExplicitAliases; + + // Super-registers where this is the first explicit sub-register. 
+ SuperRegList LeadingSuperRegs; + + SubRegMap SubRegs; + SuperRegList SuperRegs; + DenseMap SubReg2Idx; + RegUnitList RegUnits; + RegUnitLaneMaskList RegUnitLaneMasks; +}; + +inline bool operator<(const CodeGenRegister &A, const CodeGenRegister &B) { + return A.EnumValue < B.EnumValue; +} + +inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) { + return A.EnumValue == B.EnumValue; +} + +class CodeGenRegisterClass { + CodeGenRegister::Vec Members; + // Allocation orders. Order[0] always contains all registers in Members. + std::vector> Orders; + // Bit mask of sub-classes including this, indexed by their EnumValue. + BitVector SubClasses; + // List of super-classes, topologocally ordered to have the larger classes + // first. This is the same as sorting by EnumValue. + SmallVector SuperClasses; + Record *TheDef; + std::string Name; + + // For a synthesized class, inherit missing properties from the nearest + // super-class. + void inheritProperties(CodeGenRegBank &); + + // Map SubRegIndex -> sub-class. This is the largest sub-class where all + // registers have a SubRegIndex sub-register. + DenseMap + SubClassWithSubReg; + + // Map SubRegIndex -> set of super-reg classes. This is all register + // classes SuperRC such that: + // + // R:SubRegIndex in this RC for all R in SuperRC. + // + DenseMap> + SuperRegClasses; + + // Bit vector of TopoSigs for the registers in this class. This will be + // very sparse on regular architectures. + BitVector TopoSigs; + +public: + unsigned EnumValue; + StringRef Namespace; + SmallVector VTs; + RegSizeInfoByHwMode RSI; + int CopyCost; + bool Allocatable; + StringRef AltOrderSelect; + uint8_t AllocationPriority; + bool GlobalPriority; + uint8_t TSFlags; + /// Contains the combination of the lane masks of all subregisters. + LaneBitmask LaneMask; + /// True if there are at least 2 subregisters which do not interfere. + bool HasDisjunctSubRegs; + bool CoveredBySubRegs; + /// A register class is artificial if all its members are artificial. + bool Artificial; + /// Generate register pressure set for this register class and any class + /// synthesized from it. + bool GeneratePressureSet; + + // Return the Record that defined this class, or NULL if the class was + // created by TableGen. + Record *getDef() const { return TheDef; } + + std::string getNamespaceQualification() const; + const std::string &getName() const { return Name; } + std::string getQualifiedName() const; + std::string getIdName() const; + std::string getQualifiedIdName() const; + ArrayRef getValueTypes() const { return VTs; } + unsigned getNumValueTypes() const { return VTs.size(); } + bool hasType(const ValueTypeByHwMode &VT) const; + + const ValueTypeByHwMode &getValueTypeNum(unsigned VTNum) const { + if (VTNum < VTs.size()) + return VTs[VTNum]; + llvm_unreachable("VTNum greater than number of ValueTypes in RegClass!"); + } - inline bool operator<(const CodeGenRegister &A, const CodeGenRegister &B) { - return A.EnumValue < B.EnumValue; - } - - inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) { - return A.EnumValue == B.EnumValue; - } - - class CodeGenRegisterClass { - CodeGenRegister::Vec Members; - // Allocation orders. Order[0] always contains all registers in Members. - std::vector> Orders; - // Bit mask of sub-classes including this, indexed by their EnumValue. - BitVector SubClasses; - // List of super-classes, topologocally ordered to have the larger classes - // first. This is the same as sorting by EnumValue. 
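The operator< and operator== overloads in this hunk order registers purely by EnumValue, which is what keeps CodeGenRegister::Vec a canonically sorted vector and makes member-set comparisons cheap. CodeGenRegisterClass::Key (reformatted earlier in the .cpp) extends the same idea to a composite key via std::tie. The lexicographic pattern in a standalone sketch (hypothetical struct, not the real Key):

#include <tuple>

struct DemoKey {
  int Members; // stand-in for the member-set pointer
  int Size;    // stand-in for RegSizeInfoByHwMode
  // Lexicographic: compare Members first, fall back to Size on ties.
  bool operator<(const DemoKey &B) const {
    return std::tie(Members, Size) < std::tie(B.Members, B.Size);
  }
};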
- SmallVector SuperClasses; - Record *TheDef; - std::string Name; - - // For a synthesized class, inherit missing properties from the nearest - // super-class. - void inheritProperties(CodeGenRegBank&); - - // Map SubRegIndex -> sub-class. This is the largest sub-class where all - // registers have a SubRegIndex sub-register. - DenseMap - SubClassWithSubReg; - - // Map SubRegIndex -> set of super-reg classes. This is all register - // classes SuperRC such that: - // - // R:SubRegIndex in this RC for all R in SuperRC. - // - DenseMap> - SuperRegClasses; - - // Bit vector of TopoSigs for the registers in this class. This will be - // very sparse on regular architectures. - BitVector TopoSigs; - - public: - unsigned EnumValue; - StringRef Namespace; - SmallVector VTs; - RegSizeInfoByHwMode RSI; - int CopyCost; - bool Allocatable; - StringRef AltOrderSelect; - uint8_t AllocationPriority; - bool GlobalPriority; - uint8_t TSFlags; - /// Contains the combination of the lane masks of all subregisters. - LaneBitmask LaneMask; - /// True if there are at least 2 subregisters which do not interfere. - bool HasDisjunctSubRegs; - bool CoveredBySubRegs; - /// A register class is artificial if all its members are artificial. - bool Artificial; - /// Generate register pressure set for this register class and any class - /// synthesized from it. - bool GeneratePressureSet; - - // Return the Record that defined this class, or NULL if the class was - // created by TableGen. - Record *getDef() const { return TheDef; } - - std::string getNamespaceQualification() const; - const std::string &getName() const { return Name; } - std::string getQualifiedName() const; - std::string getIdName() const; - std::string getQualifiedIdName() const; - ArrayRef getValueTypes() const { return VTs; } - unsigned getNumValueTypes() const { return VTs.size(); } - bool hasType(const ValueTypeByHwMode &VT) const; - - const ValueTypeByHwMode &getValueTypeNum(unsigned VTNum) const { - if (VTNum < VTs.size()) - return VTs[VTNum]; - llvm_unreachable("VTNum greater than number of ValueTypes in RegClass!"); - } + // Return true if this class contains the register. + bool contains(const CodeGenRegister *) const; - // Return true if this class contains the register. - bool contains(const CodeGenRegister*) const; - - // Returns true if RC is a subclass. - // RC is a sub-class of this class if it is a valid replacement for any - // instruction operand where a register of this classis required. It must - // satisfy these conditions: - // - // 1. All RC registers are also in this. - // 2. The RC spill size must not be smaller than our spill size. - // 3. RC spill alignment must be compatible with ours. - // - bool hasSubClass(const CodeGenRegisterClass *RC) const { - return SubClasses.test(RC->EnumValue); - } + // Returns true if RC is a subclass. + // RC is a sub-class of this class if it is a valid replacement for any + // instruction operand where a register of this classis required. It must + // satisfy these conditions: + // + // 1. All RC registers are also in this. + // 2. The RC spill size must not be smaller than our spill size. + // 3. RC spill alignment must be compatible with ours. + // + bool hasSubClass(const CodeGenRegisterClass *RC) const { + return SubClasses.test(RC->EnumValue); + } - // getSubClassWithSubReg - Returns the largest sub-class where all - // registers have a SubIdx sub-register. 
- CodeGenRegisterClass * - getSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx) const { - return SubClassWithSubReg.lookup(SubIdx); - } + // getSubClassWithSubReg - Returns the largest sub-class where all + // registers have a SubIdx sub-register. + CodeGenRegisterClass * + getSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx) const { + return SubClassWithSubReg.lookup(SubIdx); + } - /// Find largest subclass where all registers have SubIdx subregisters in - /// SubRegClass and the largest subregister class that contains those - /// subregisters without (as far as possible) also containing additional registers. - /// - /// This can be used to find a suitable pair of classes for subregister copies. - /// \return std::pair where SubClass is a SubClass is - /// a class where every register has SubIdx and SubRegClass is a class where - /// every register is covered by the SubIdx subregister of SubClass. - std::optional> - getMatchingSubClassWithSubRegs(CodeGenRegBank &RegBank, - const CodeGenSubRegIndex *SubIdx) const; - - void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx, - CodeGenRegisterClass *SubRC) { - SubClassWithSubReg[SubIdx] = SubRC; - } + /// Find largest subclass where all registers have SubIdx subregisters in + /// SubRegClass and the largest subregister class that contains those + /// subregisters without (as far as possible) also containing additional + /// registers. + /// + /// This can be used to find a suitable pair of classes for subregister + /// copies. \return std::pair where SubClass is a + /// SubClass is a class where every register has SubIdx and SubRegClass is a + /// class where every register is covered by the SubIdx subregister of + /// SubClass. + std::optional> + getMatchingSubClassWithSubRegs(CodeGenRegBank &RegBank, + const CodeGenSubRegIndex *SubIdx) const; + + void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx, + CodeGenRegisterClass *SubRC) { + SubClassWithSubReg[SubIdx] = SubRC; + } - // getSuperRegClasses - Returns a bit vector of all register classes - // containing only SubIdx super-registers of this class. - void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx, - BitVector &Out) const; + // getSuperRegClasses - Returns a bit vector of all register classes + // containing only SubIdx super-registers of this class. + void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx, + BitVector &Out) const; - // addSuperRegClass - Add a class containing only SubIdx super-registers. - void addSuperRegClass(CodeGenSubRegIndex *SubIdx, - CodeGenRegisterClass *SuperRC) { - SuperRegClasses[SubIdx].insert(SuperRC); - } + // addSuperRegClass - Add a class containing only SubIdx super-registers. + void addSuperRegClass(CodeGenSubRegIndex *SubIdx, + CodeGenRegisterClass *SuperRC) { + SuperRegClasses[SubIdx].insert(SuperRC); + } - // getSubClasses - Returns a constant BitVector of subclasses indexed by - // EnumValue. - // The SubClasses vector includes an entry for this class. - const BitVector &getSubClasses() const { return SubClasses; } + // getSubClasses - Returns a constant BitVector of subclasses indexed by + // EnumValue. + // The SubClasses vector includes an entry for this class. + const BitVector &getSubClasses() const { return SubClasses; } - // getSuperClasses - Returns a list of super classes ordered by EnumValue. - // The array does not include an entry for this class. - ArrayRef getSuperClasses() const { - return SuperClasses; - } + // getSuperClasses - Returns a list of super classes ordered by EnumValue. 
+ // The array does not include an entry for this class. + ArrayRef getSuperClasses() const { + return SuperClasses; + } - // Returns an ordered list of class members. - // The order of registers is the same as in the .td file. - // No = 0 is the default allocation order, No = 1 is the first alternative. - ArrayRef getOrder(unsigned No = 0) const { - return Orders[No]; - } + // Returns an ordered list of class members. + // The order of registers is the same as in the .td file. + // No = 0 is the default allocation order, No = 1 is the first alternative. + ArrayRef getOrder(unsigned No = 0) const { return Orders[No]; } - // Return the total number of allocation orders available. - unsigned getNumOrders() const { return Orders.size(); } + // Return the total number of allocation orders available. + unsigned getNumOrders() const { return Orders.size(); } - // Get the set of registers. This set contains the same registers as - // getOrder(0). - const CodeGenRegister::Vec &getMembers() const { return Members; } + // Get the set of registers. This set contains the same registers as + // getOrder(0). + const CodeGenRegister::Vec &getMembers() const { return Members; } - // Get a bit vector of TopoSigs present in this register class. - const BitVector &getTopoSigs() const { return TopoSigs; } + // Get a bit vector of TopoSigs present in this register class. + const BitVector &getTopoSigs() const { return TopoSigs; } - // Get a weight of this register class. - unsigned getWeight(const CodeGenRegBank&) const; + // Get a weight of this register class. + unsigned getWeight(const CodeGenRegBank &) const; - // Populate a unique sorted list of units from a register set. - void buildRegUnitSet(const CodeGenRegBank &RegBank, - std::vector &RegUnits) const; + // Populate a unique sorted list of units from a register set. + void buildRegUnitSet(const CodeGenRegBank &RegBank, + std::vector &RegUnits) const; - CodeGenRegisterClass(CodeGenRegBank&, Record *R); - CodeGenRegisterClass(CodeGenRegisterClass&) = delete; + CodeGenRegisterClass(CodeGenRegBank &, Record *R); + CodeGenRegisterClass(CodeGenRegisterClass &) = delete; - // A key representing the parts of a register class used for forming - // sub-classes. Note the ordering provided by this key is not the same as - // the topological order used for the EnumValues. - struct Key { - const CodeGenRegister::Vec *Members; - RegSizeInfoByHwMode RSI; + // A key representing the parts of a register class used for forming + // sub-classes. Note the ordering provided by this key is not the same as + // the topological order used for the EnumValues. + struct Key { + const CodeGenRegister::Vec *Members; + RegSizeInfoByHwMode RSI; - Key(const CodeGenRegister::Vec *M, const RegSizeInfoByHwMode &I) + Key(const CodeGenRegister::Vec *M, const RegSizeInfoByHwMode &I) : Members(M), RSI(I) {} - Key(const CodeGenRegisterClass &RC) + Key(const CodeGenRegisterClass &RC) : Members(&RC.getMembers()), RSI(RC.RSI) {} - // Lexicographical order of (Members, RegSizeInfoByHwMode). - bool operator<(const Key&) const; - }; - - // Create a non-user defined register class. - CodeGenRegisterClass(CodeGenRegBank&, StringRef Name, Key Props); - - // Called by CodeGenRegBank::CodeGenRegBank(). - static void computeSubClasses(CodeGenRegBank&); - - // Get ordering value among register base classes. 
- std::optional getBaseClassOrder() const { - if (TheDef && !TheDef->isValueUnset("BaseClassOrder")) - return TheDef->getValueAsInt("BaseClassOrder"); - return {}; - } + // Lexicographical order of (Members, RegSizeInfoByHwMode). + bool operator<(const Key &) const; }; - // Register categories are used when we need to deterine the category a - // register falls into (GPR, vector, fixed, etc.) without having to know - // specific information about the target architecture. - class CodeGenRegisterCategory { - Record *TheDef; - std::string Name; - std::list Classes; + // Create a non-user defined register class. + CodeGenRegisterClass(CodeGenRegBank &, StringRef Name, Key Props); - public: - CodeGenRegisterCategory(CodeGenRegBank &, Record *R); - CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete; + // Called by CodeGenRegBank::CodeGenRegBank(). + static void computeSubClasses(CodeGenRegBank &); - // Return the Record that defined this class, or NULL if the class was - // created by TableGen. - Record *getDef() const { return TheDef; } - - std::string getName() const { return Name; } - std::list getClasses() const { return Classes; } - }; + // Get ordering value among register base classes. + std::optional getBaseClassOrder() const { + if (TheDef && !TheDef->isValueUnset("BaseClassOrder")) + return TheDef->getValueAsInt("BaseClassOrder"); + return {}; + } +}; + +// Register categories are used when we need to deterine the category a +// register falls into (GPR, vector, fixed, etc.) without having to know +// specific information about the target architecture. +class CodeGenRegisterCategory { + Record *TheDef; + std::string Name; + std::list Classes; + +public: + CodeGenRegisterCategory(CodeGenRegBank &, Record *R); + CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete; + + // Return the Record that defined this class, or NULL if the class was + // created by TableGen. + Record *getDef() const { return TheDef; } + + std::string getName() const { return Name; } + std::list getClasses() const { return Classes; } +}; + +// Register units are used to model interference and register pressure. +// Every register is assigned one or more register units such that two +// registers overlap if and only if they have a register unit in common. +// +// Normally, one register unit is created per leaf register. Non-leaf +// registers inherit the units of their sub-registers. +struct RegUnit { + // Weight assigned to this RegUnit for estimating register pressure. + // This is useful when equalizing weights in register classes with mixed + // register topologies. + unsigned Weight; + + // Each native RegUnit corresponds to one or two root registers. The full + // set of registers containing this unit can be computed as the union of + // these two registers and their super-registers. + const CodeGenRegister *Roots[2]; + + // Index into RegClassUnitSets where we can find the list of UnitSets that + // contain this unit. + unsigned RegClassUnitSetsIdx; + // A register unit is artificial if at least one of its roots is + // artificial. + bool Artificial; + + RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) { + Roots[0] = Roots[1] = nullptr; + } - // Register units are used to model interference and register pressure. - // Every register is assigned one or more register units such that two - // registers overlap if and only if they have a register unit in common. - // - // Normally, one register unit is created per leaf register. 
Non-leaf - // registers inherit the units of their sub-registers. - struct RegUnit { - // Weight assigned to this RegUnit for estimating register pressure. - // This is useful when equalizing weights in register classes with mixed - // register topologies. - unsigned Weight; - - // Each native RegUnit corresponds to one or two root registers. The full - // set of registers containing this unit can be computed as the union of - // these two registers and their super-registers. - const CodeGenRegister *Roots[2]; - - // Index into RegClassUnitSets where we can find the list of UnitSets that - // contain this unit. - unsigned RegClassUnitSetsIdx; - // A register unit is artificial if at least one of its roots is - // artificial. - bool Artificial; - - RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) { - Roots[0] = Roots[1] = nullptr; - } + ArrayRef getRoots() const { + assert(!(Roots[1] && !Roots[0]) && "Invalid roots array"); + return ArrayRef(Roots, !!Roots[0] + !!Roots[1]); + } +}; - ArrayRef getRoots() const { - assert(!(Roots[1] && !Roots[0]) && "Invalid roots array"); - return ArrayRef(Roots, !!Roots[0] + !!Roots[1]); - } - }; +// Each RegUnitSet is a sorted vector with a name. +struct RegUnitSet { + typedef std::vector::const_iterator iterator; - // Each RegUnitSet is a sorted vector with a name. - struct RegUnitSet { - typedef std::vector::const_iterator iterator; + std::string Name; + std::vector Units; + unsigned Weight = 0; // Cache the sum of all unit weights. + unsigned Order = 0; // Cache the sort key. - std::string Name; - std::vector Units; - unsigned Weight = 0; // Cache the sum of all unit weights. - unsigned Order = 0; // Cache the sort key. + RegUnitSet() = default; +}; - RegUnitSet() = default; - }; +// Base vector for identifying TopoSigs. The contents uniquely identify a +// TopoSig, only computeSuperRegs needs to know how. +typedef SmallVector TopoSigId; - // Base vector for identifying TopoSigs. The contents uniquely identify a - // TopoSig, only computeSuperRegs needs to know how. - typedef SmallVector TopoSigId; +// CodeGenRegBank - Represent a target's registers and the relations between +// them. +class CodeGenRegBank { + SetTheory Sets; - // CodeGenRegBank - Represent a target's registers and the relations between - // them. - class CodeGenRegBank { - SetTheory Sets; + const CodeGenHwModes &CGH; - const CodeGenHwModes &CGH; + std::deque SubRegIndices; + DenseMap Def2SubRegIdx; - std::deque SubRegIndices; - DenseMap Def2SubRegIdx; + CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace); - CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace); + typedef std::map, CodeGenSubRegIndex *> + ConcatIdxMap; + ConcatIdxMap ConcatIdx; - typedef std::map, - CodeGenSubRegIndex*> ConcatIdxMap; - ConcatIdxMap ConcatIdx; + // Registers. + std::deque Registers; + StringMap RegistersByName; + DenseMap Def2Reg; + unsigned NumNativeRegUnits; - // Registers. - std::deque Registers; - StringMap RegistersByName; - DenseMap Def2Reg; - unsigned NumNativeRegUnits; + std::map TopoSigs; - std::map TopoSigs; + // Includes native (0..NumNativeRegUnits-1) and adopted register units. + SmallVector RegUnits; - // Includes native (0..NumNativeRegUnits-1) and adopted register units. - SmallVector RegUnits; + // Register classes. + std::list RegClasses; + DenseMap Def2RC; + typedef std::map RCKeyMap; + RCKeyMap Key2RC; - // Register classes. 
-    std::list<CodeGenRegisterClass> RegClasses;
-    DenseMap<const Record*, CodeGenRegisterClass*> Def2RC;
-    typedef std::map<CodeGenRegisterClass::Key, CodeGenRegisterClass*> RCKeyMap;
-    RCKeyMap Key2RC;
+  // Register categories.
+  std::list<CodeGenRegisterCategory> RegCategories;
+  DenseMap<const Record *, CodeGenRegisterCategory *> Def2RCat;
+  using RCatKeyMap =
+      std::map<std::list<CodeGenRegisterClass *>, CodeGenRegisterCategory *>;
+  RCatKeyMap Key2RCat;
-    // Register categories.
-    std::list<CodeGenRegisterCategory> RegCategories;
-    DenseMap<const Record*, CodeGenRegisterCategory*> Def2RCat;
-    using RCatKeyMap =
-        std::map<std::list<CodeGenRegisterClass*>, CodeGenRegisterCategory*>;
-    RCatKeyMap Key2RCat;
+  // Remember each unique set of register units. Initially, this contains a
+  // unique set for each register class. Similar sets are coalesced with
+  // pruneUnitSets and new supersets are inferred during computeRegUnitSets.
+  std::vector<RegUnitSet> RegUnitSets;
-    // Remember each unique set of register units. Initially, this contains a
-    // unique set for each register class. Similar sets are coalesced with
-    // pruneUnitSets and new supersets are inferred during computeRegUnitSets.
-    std::vector<RegUnitSet> RegUnitSets;
+  // Map RegisterClass index to the index of the RegUnitSet that contains the
+  // class's units and any inferred RegUnit supersets.
+  //
+  // NOTE: This could grow beyond the number of register classes when we map
+  // register units to lists of unit sets. If the list of unit sets does not
+  // already exist for a register class, we create a new entry in this vector.
+  std::vector<std::vector<unsigned>> RegClassUnitSets;
-    // Map RegisterClass index to the index of the RegUnitSet that contains the
-    // class's units and any inferred RegUnit supersets.
-    //
-    // NOTE: This could grow beyond the number of register classes when we map
-    // register units to lists of unit sets. If the list of unit sets does not
-    // already exist for a register class, we create a new entry in this vector.
-    std::vector<std::vector<unsigned>> RegClassUnitSets;
+  // Give each register unit set an order based on sorting criteria.
+  std::vector<unsigned> RegUnitSetOrder;
-    // Give each register unit set an order based on sorting criteria.
-    std::vector<unsigned> RegUnitSetOrder;
+  // Keep track of synthesized definitions generated in TupleExpander.
+  std::vector<std::unique_ptr<Record>> SynthDefs;
-    // Keep track of synthesized definitions generated in TupleExpander.
-    std::vector<std::unique_ptr<Record>> SynthDefs;
+  // Add RC to *2RC maps.
+  void addToMaps(CodeGenRegisterClass *);
-    // Add RC to *2RC maps.
-    void addToMaps(CodeGenRegisterClass*);
+  // Create a synthetic sub-class if it is missing.
+  CodeGenRegisterClass *getOrCreateSubClass(const CodeGenRegisterClass *RC,
+                                            const CodeGenRegister::Vec *Membs,
+                                            StringRef Name);
-    // Create a synthetic sub-class if it is missing.
-    CodeGenRegisterClass *getOrCreateSubClass(const CodeGenRegisterClass *RC,
-                                              const CodeGenRegister::Vec *Membs,
-                                              StringRef Name);
+  // Infer missing register classes.
+  void computeInferredRegisterClasses();
+  void inferCommonSubClass(CodeGenRegisterClass *RC);
+  void inferSubClassWithSubReg(CodeGenRegisterClass *RC);
-    // Infer missing register classes.
-    void computeInferredRegisterClasses();
-    void inferCommonSubClass(CodeGenRegisterClass *RC);
-    void inferSubClassWithSubReg(CodeGenRegisterClass *RC);
+  void inferMatchingSuperRegClass(CodeGenRegisterClass *RC) {
+    inferMatchingSuperRegClass(RC, RegClasses.begin());
+  }
-    void inferMatchingSuperRegClass(CodeGenRegisterClass *RC) {
-      inferMatchingSuperRegClass(RC, RegClasses.begin());
-    }
+  void inferMatchingSuperRegClass(
+      CodeGenRegisterClass *RC,
+      std::list<CodeGenRegisterClass>::iterator FirstSubRegRC);
-    void inferMatchingSuperRegClass(
-        CodeGenRegisterClass *RC,
-        std::list<CodeGenRegisterClass>::iterator FirstSubRegRC);
+  // Iteratively prune unit sets.
+  void pruneUnitSets();
-    // Iteratively prune unit sets.
- void pruneUnitSets(); + // Compute a weight for each register unit created during getSubRegs. + void computeRegUnitWeights(); - // Compute a weight for each register unit created during getSubRegs. - void computeRegUnitWeights(); + // Create a RegUnitSet for each RegClass and infer superclasses. + void computeRegUnitSets(); - // Create a RegUnitSet for each RegClass and infer superclasses. - void computeRegUnitSets(); + // Populate the Composite map from sub-register relationships. + void computeComposites(); - // Populate the Composite map from sub-register relationships. - void computeComposites(); + // Compute a lane mask for each sub-register index. + void computeSubRegLaneMasks(); - // Compute a lane mask for each sub-register index. - void computeSubRegLaneMasks(); + /// Computes a lane mask for each register unit enumerated by a physical + /// register. + void computeRegUnitLaneMasks(); - /// Computes a lane mask for each register unit enumerated by a physical - /// register. - void computeRegUnitLaneMasks(); +public: + CodeGenRegBank(RecordKeeper &, const CodeGenHwModes &); + CodeGenRegBank(CodeGenRegBank &) = delete; - public: - CodeGenRegBank(RecordKeeper&, const CodeGenHwModes&); - CodeGenRegBank(CodeGenRegBank&) = delete; + SetTheory &getSets() { return Sets; } - SetTheory &getSets() { return Sets; } + const CodeGenHwModes &getHwModes() const { return CGH; } - const CodeGenHwModes &getHwModes() const { return CGH; } + // Sub-register indices. The first NumNamedIndices are defined by the user + // in the .td files. The rest are synthesized such that all sub-registers + // have a unique name. + const std::deque &getSubRegIndices() const { + return SubRegIndices; + } - // Sub-register indices. The first NumNamedIndices are defined by the user - // in the .td files. The rest are synthesized such that all sub-registers - // have a unique name. - const std::deque &getSubRegIndices() const { - return SubRegIndices; - } + // Find a SubRegIndex from its Record def or add to the list if it does + // not exist there yet. + CodeGenSubRegIndex *getSubRegIdx(Record *); - // Find a SubRegIndex from its Record def or add to the list if it does - // not exist there yet. - CodeGenSubRegIndex *getSubRegIdx(Record*); + // Find a SubRegIndex from its Record def. + const CodeGenSubRegIndex *findSubRegIdx(const Record *Def) const; - // Find a SubRegIndex from its Record def. - const CodeGenSubRegIndex *findSubRegIdx(const Record* Def) const; + // Find or create a sub-register index representing the A+B composition. + CodeGenSubRegIndex *getCompositeSubRegIndex(CodeGenSubRegIndex *A, + CodeGenSubRegIndex *B); - // Find or create a sub-register index representing the A+B composition. - CodeGenSubRegIndex *getCompositeSubRegIndex(CodeGenSubRegIndex *A, - CodeGenSubRegIndex *B); + // Find or create a sub-register index representing the concatenation of + // non-overlapping sibling indices. + CodeGenSubRegIndex * + getConcatSubRegIndex(const SmallVector &); - // Find or create a sub-register index representing the concatenation of - // non-overlapping sibling indices. - CodeGenSubRegIndex * - getConcatSubRegIndex(const SmallVector&); + const std::deque &getRegisters() const { return Registers; } - const std::deque &getRegisters() const { - return Registers; - } + const StringMap &getRegistersByName() const { + return RegistersByName; + } - const StringMap &getRegistersByName() const { - return RegistersByName; - } + // Find a register from its Record def. 
+ CodeGenRegister *getReg(Record *); - // Find a register from its Record def. - CodeGenRegister *getReg(Record*); + // Get a Register's index into the Registers array. + unsigned getRegIndex(const CodeGenRegister *Reg) const { + return Reg->EnumValue - 1; + } - // Get a Register's index into the Registers array. - unsigned getRegIndex(const CodeGenRegister *Reg) const { - return Reg->EnumValue - 1; - } + // Return the number of allocated TopoSigs. The first TopoSig representing + // leaf registers is allocated number 0. + unsigned getNumTopoSigs() const { return TopoSigs.size(); } - // Return the number of allocated TopoSigs. The first TopoSig representing - // leaf registers is allocated number 0. - unsigned getNumTopoSigs() const { - return TopoSigs.size(); - } + // Find or create a TopoSig for the given TopoSigId. + // This function is only for use by CodeGenRegister::computeSuperRegs(). + // Others should simply use Reg->getTopoSig(). + unsigned getTopoSig(const TopoSigId &Id) { + return TopoSigs.insert(std::make_pair(Id, TopoSigs.size())).first->second; + } - // Find or create a TopoSig for the given TopoSigId. - // This function is only for use by CodeGenRegister::computeSuperRegs(). - // Others should simply use Reg->getTopoSig(). - unsigned getTopoSig(const TopoSigId &Id) { - return TopoSigs.insert(std::make_pair(Id, TopoSigs.size())).first->second; - } + // Create a native register unit that is associated with one or two root + // registers. + unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) { + RegUnits.resize(RegUnits.size() + 1); + RegUnit &RU = RegUnits.back(); + RU.Roots[0] = R0; + RU.Roots[1] = R1; + RU.Artificial = R0->Artificial; + if (R1) + RU.Artificial |= R1->Artificial; + return RegUnits.size() - 1; + } - // Create a native register unit that is associated with one or two root - // registers. - unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) { - RegUnits.resize(RegUnits.size() + 1); - RegUnit &RU = RegUnits.back(); - RU.Roots[0] = R0; - RU.Roots[1] = R1; - RU.Artificial = R0->Artificial; - if (R1) - RU.Artificial |= R1->Artificial; - return RegUnits.size() - 1; - } + // Create a new non-native register unit that can be adopted by a register + // to increase its pressure. Note that NumNativeRegUnits is not increased. + unsigned newRegUnit(unsigned Weight) { + RegUnits.resize(RegUnits.size() + 1); + RegUnits.back().Weight = Weight; + return RegUnits.size() - 1; + } - // Create a new non-native register unit that can be adopted by a register - // to increase its pressure. Note that NumNativeRegUnits is not increased. - unsigned newRegUnit(unsigned Weight) { - RegUnits.resize(RegUnits.size() + 1); - RegUnits.back().Weight = Weight; - return RegUnits.size() - 1; - } + // Native units are the singular unit of a leaf register. Register aliasing + // is completely characterized by native units. Adopted units exist to give + // register additional weight but don't affect aliasing. + bool isNativeUnit(unsigned RUID) const { return RUID < NumNativeRegUnits; } - // Native units are the singular unit of a leaf register. Register aliasing - // is completely characterized by native units. Adopted units exist to give - // register additional weight but don't affect aliasing. 
- bool isNativeUnit(unsigned RUID) const { - return RUID < NumNativeRegUnits; - } + unsigned getNumNativeRegUnits() const { return NumNativeRegUnits; } - unsigned getNumNativeRegUnits() const { - return NumNativeRegUnits; - } + RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; } + const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; } - RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; } - const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; } + std::list &getRegClasses() { return RegClasses; } - std::list &getRegClasses() { return RegClasses; } + const std::list &getRegClasses() const { + return RegClasses; + } - const std::list &getRegClasses() const { - return RegClasses; - } + std::list &getRegCategories() { + return RegCategories; + } - std::list &getRegCategories() { - return RegCategories; - } + const std::list &getRegCategories() const { + return RegCategories; + } - const std::list &getRegCategories() const { - return RegCategories; - } + // Find a register class from its def. + CodeGenRegisterClass *getRegClass(const Record *) const; + + /// getRegisterClassForRegister - Find the register class that contains the + /// specified physical register. If the register is not in a register + /// class, return null. If the register is in multiple classes, and the + /// classes have a superset-subset relationship and the same set of types, + /// return the superclass. Otherwise return null. + const CodeGenRegisterClass *getRegClassForRegister(Record *R); + + // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike + // getRegClassForRegister, this tries to find the smallest class containing + // the physical register. If \p VT is specified, it will only find classes + // with a matching type + const CodeGenRegisterClass * + getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr); + + // Get the sum of unit weights. + unsigned getRegUnitSetWeight(const std::vector &Units) const { + unsigned Weight = 0; + for (unsigned Unit : Units) + Weight += getRegUnit(Unit).Weight; + return Weight; + } - // Find a register class from its def. - CodeGenRegisterClass *getRegClass(const Record *) const; - - /// getRegisterClassForRegister - Find the register class that contains the - /// specified physical register. If the register is not in a register - /// class, return null. If the register is in multiple classes, and the - /// classes have a superset-subset relationship and the same set of types, - /// return the superclass. Otherwise return null. - const CodeGenRegisterClass* getRegClassForRegister(Record *R); - - // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike - // getRegClassForRegister, this tries to find the smallest class containing - // the physical register. If \p VT is specified, it will only find classes - // with a matching type - const CodeGenRegisterClass * - getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr); - - // Get the sum of unit weights. 
- unsigned getRegUnitSetWeight(const std::vector &Units) const { - unsigned Weight = 0; - for (unsigned Unit : Units) - Weight += getRegUnit(Unit).Weight; - return Weight; - } + unsigned getRegSetIDAt(unsigned Order) const { + return RegUnitSetOrder[Order]; + } - unsigned getRegSetIDAt(unsigned Order) const { - return RegUnitSetOrder[Order]; - } + const RegUnitSet &getRegSetAt(unsigned Order) const { + return RegUnitSets[RegUnitSetOrder[Order]]; + } - const RegUnitSet &getRegSetAt(unsigned Order) const { - return RegUnitSets[RegUnitSetOrder[Order]]; - } + // Increase a RegUnitWeight. + void increaseRegUnitWeight(unsigned RUID, unsigned Inc) { + getRegUnit(RUID).Weight += Inc; + } - // Increase a RegUnitWeight. - void increaseRegUnitWeight(unsigned RUID, unsigned Inc) { - getRegUnit(RUID).Weight += Inc; - } + // Get the number of register pressure dimensions. + unsigned getNumRegPressureSets() const { return RegUnitSets.size(); } - // Get the number of register pressure dimensions. - unsigned getNumRegPressureSets() const { return RegUnitSets.size(); } + // Get a set of register unit IDs for a given dimension of pressure. + const RegUnitSet &getRegPressureSet(unsigned Idx) const { + return RegUnitSets[Idx]; + } - // Get a set of register unit IDs for a given dimension of pressure. - const RegUnitSet &getRegPressureSet(unsigned Idx) const { - return RegUnitSets[Idx]; - } + // The number of pressure set lists may be larger than the number of + // register classes if some register units appeared in a list of sets that + // did not correspond to an existing register class. + unsigned getNumRegClassPressureSetLists() const { + return RegClassUnitSets.size(); + } - // The number of pressure set lists may be larger than the number of - // register classes if some register units appeared in a list of sets that - // did not correspond to an existing register class. - unsigned getNumRegClassPressureSetLists() const { - return RegClassUnitSets.size(); - } + // Get a list of pressure set IDs for a register class. Liveness of a + // register in this class impacts each pressure set in this list by the + // weight of the register. An exact solution requires all registers in a + // class to have the same class, but it is not strictly guaranteed. + ArrayRef<unsigned> getRCPressureSetIDs(unsigned RCIdx) const { + return RegClassUnitSets[RCIdx]; + } - // Get a list of pressure set IDs for a register class. Liveness of a - // register in this class impacts each pressure set in this list by the - // weight of the register. An exact solution requires all registers in a - // class to have the same class, but it is not strictly guaranteed. - ArrayRef<unsigned> getRCPressureSetIDs(unsigned RCIdx) const { - return RegClassUnitSets[RCIdx]; - } + // Compute derived records such as missing sub-register indices. + void computeDerivedInfo(); - // Compute derived records such as missing sub-register indices. - void computeDerivedInfo(); - - // Compute the set of registers completely covered by the registers in Regs. - // The returned BitVector will have a bit set for each register in Regs, - // all sub-registers, and all super-registers that are covered by the - // registers in Regs. - // - // This is used to compute the mask of call-preserved registers from a list - // of callee-saves. - BitVector computeCoveredRegisters(ArrayRef Regs); - - // Bit mask of lanes that cover their registers.
A sub-register index whose - // LaneMask is contained in CoveringLanes will be completely covered by - // another sub-register with the same or larger lane mask. - LaneBitmask CoveringLanes; - - // Helper function for printing debug information. Handles artificial - // (non-native) reg units. - void printRegUnitName(unsigned Unit) const; - }; + // Compute the set of registers completely covered by the registers in Regs. + // The returned BitVector will have a bit set for each register in Regs, + // all sub-registers, and all super-registers that are covered by the + // registers in Regs. + // + // This is used to compute the mask of call-preserved registers from a list + // of callee-saves. + BitVector computeCoveredRegisters(ArrayRef Regs); + + // Bit mask of lanes that cover their registers. A sub-register index whose + // LaneMask is contained in CoveringLanes will be completely covered by + // another sub-register with the same or larger lane mask. + LaneBitmask CoveringLanes; + + // Helper function for printing debug information. Handles artificial + // (non-native) reg units. + void printRegUnitName(unsigned Unit) const; +}; } // end namespace llvm diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp index 54463da..9cebc42 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -51,7 +51,7 @@ struct InstrsOp : public SetTheory::Operator { // (instregex "OpcPat",...) Find all instructions matching an opcode pattern. struct InstRegexOp : public SetTheory::Operator { const CodeGenTarget &Target; - InstRegexOp(const CodeGenTarget &t): Target(t) {} + InstRegexOp(const CodeGenTarget &t) : Target(t) {} /// Remove any text inside of parentheses from S. static std::string removeParens(llvm::StringRef S) { @@ -182,8 +182,8 @@ struct InstRegexOp : public SetTheory::Operator { /// CodeGenModels ctor interprets machine model records and populates maps. CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK, - const CodeGenTarget &TGT): - Records(RK), Target(TGT) { + const CodeGenTarget &TGT) + : Records(RK), Target(TGT) { Sets.addFieldExpander("InstRW", "Instrs"); @@ -298,9 +298,8 @@ static APInt constructOperandMask(ArrayRef Indices) { return OperandMask; } -static void -processSTIPredicate(STIPredicateFunction &Fn, - const ProcModelMapTy &ProcModelMap) { +static void processSTIPredicate(STIPredicateFunction &Fn, + const ProcModelMapTy &ProcModelMap) { DenseMap Opcode2Index; using OpcodeMapPair = std::pair; std::vector OpcodeMappings; @@ -380,30 +379,29 @@ processSTIPredicate(STIPredicateFunction &Fn, // Sort OpcodeMappings elements based on their CPU and predicate masks. // As a last resort, order elements by opcode identifier. 
- llvm::sort(OpcodeMappings, - [&](const OpcodeMapPair &Lhs, const OpcodeMapPair &Rhs) { - unsigned LhsIdx = Opcode2Index[Lhs.first]; - unsigned RhsIdx = Opcode2Index[Rhs.first]; - const std::pair<APInt, APInt> &LhsMasks = OpcodeMasks[LhsIdx]; - const std::pair<APInt, APInt> &RhsMasks = OpcodeMasks[RhsIdx]; - - auto PopulationCountAndLeftBit = - [](const APInt &Other) -> std::pair<int, int> { - return std::pair(Other.popcount(), - -Other.countl_zero()); - }; - auto lhsmask_first = PopulationCountAndLeftBit(LhsMasks.first); - auto rhsmask_first = PopulationCountAndLeftBit(RhsMasks.first); - if (lhsmask_first != rhsmask_first) - return lhsmask_first < rhsmask_first; - - auto lhsmask_second = PopulationCountAndLeftBit(LhsMasks.second); - auto rhsmask_second = PopulationCountAndLeftBit(RhsMasks.second); - if (lhsmask_second != rhsmask_second) - return lhsmask_second < rhsmask_second; - - return LhsIdx < RhsIdx; - }); + llvm::sort( + OpcodeMappings, [&](const OpcodeMapPair &Lhs, const OpcodeMapPair &Rhs) { + unsigned LhsIdx = Opcode2Index[Lhs.first]; + unsigned RhsIdx = Opcode2Index[Rhs.first]; + const std::pair<APInt, APInt> &LhsMasks = OpcodeMasks[LhsIdx]; + const std::pair<APInt, APInt> &RhsMasks = OpcodeMasks[RhsIdx]; + + auto PopulationCountAndLeftBit = + [](const APInt &Other) -> std::pair<int, int> { + return std::pair(Other.popcount(), -Other.countl_zero()); + }; + auto lhsmask_first = PopulationCountAndLeftBit(LhsMasks.first); + auto rhsmask_first = PopulationCountAndLeftBit(RhsMasks.first); + if (lhsmask_first != rhsmask_first) + return lhsmask_first < rhsmask_first; + + auto lhsmask_second = PopulationCountAndLeftBit(LhsMasks.second); + auto rhsmask_second = PopulationCountAndLeftBit(RhsMasks.second); + if (lhsmask_second != rhsmask_second) + return lhsmask_second < rhsmask_second; + + return LhsIdx < RhsIdx; + }); // Now construct opcode groups. Groups are used by the SubtargetEmitter when // expanding the body of a STIPredicate function. In particular, each opcode @@ -498,8 +496,7 @@ void CodeGenSchedModels::collectLoadStoreQueueInfo() { CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel")); if (Queue->isSubClassOf("LoadQueue")) { if (PM.LoadQueue) { - PrintError(Queue->getLoc(), - "Expected a single LoadQueue definition"); + PrintError(Queue->getLoc(), "Expected a single LoadQueue definition"); PrintNote(PM.LoadQueue->getLoc(), "Previous definition of LoadQueue was here"); } @@ -509,8 +506,7 @@ if (Queue->isSubClassOf("StoreQueue")) { if (PM.StoreQueue) { - PrintError(Queue->getLoc(), - "Expected a single StoreQueue definition"); + PrintError(Queue->getLoc(), "Expected a single StoreQueue definition"); PrintNote(PM.StoreQueue->getLoc(), "Previous definition of StoreQueue was here"); } @@ -542,14 +538,15 @@ void CodeGenSchedModels::collectProcModels() { // Check for duplicated names. auto I = std::adjacent_find(ProcRecords.begin(), ProcRecords.end(), [](const Record *Rec1, const Record *Rec2) { - return Rec1->getValueAsString("Name") == Rec2->getValueAsString("Name"); - }); + return Rec1->getValueAsString("Name") == + Rec2->getValueAsString("Name"); + }); if (I != ProcRecords.end()) PrintFatalError((*I)->getLoc(), "Duplicate processor name " + - (*I)->getValueAsString("Name")); + (*I)->getValueAsString("Name")); // Reserve space because we can. Reallocation would be ok. - ProcModels.reserve(ProcRecords.size()+1); + ProcModels.reserve(ProcRecords.size() + 1); // Use idx=0 for NoModel/NoItineraries.
Record *NoModelDef = Records.getDef("NoSchedModel"); @@ -574,8 +571,7 @@ void CodeGenSchedModels::addProcModel(Record *ProcDef) { if (ModelKey->isSubClassOf("SchedMachineModel")) { Record *ItinsDef = ModelKey->getValueAsDef("Itineraries"); ProcModels.emplace_back(ProcModels.size(), Name, ModelKey, ItinsDef); - } - else { + } else { // An itinerary is defined without a machine model. Infer a new model. if (!ModelKey->getValueAsListOfDefs("IID").empty()) Name = Name + "Model"; @@ -587,7 +583,7 @@ void CodeGenSchedModels::addProcModel(Record *ProcDef) { // Recursively find all reachable SchedReadWrite records. static void scanSchedRW(Record *RWDef, RecVec &RWDefs, - SmallPtrSet &RWSet) { + SmallPtrSet &RWSet) { if (!RWSet.insert(RWDef).second) return; RWDefs.push_back(RWDef); @@ -596,8 +592,7 @@ static void scanSchedRW(Record *RWDef, RecVec &RWDefs, RecVec Seq = RWDef->getValueAsListOfDefs("Writes"); for (Record *WSRec : Seq) scanSchedRW(WSRec, RWDefs, RWSet); - } - else if (RWDef->isSubClassOf("SchedVariant")) { + } else if (RWDef->isSubClassOf("SchedVariant")) { // Visit each variant (guarded by a different predicate). RecVec Vars = RWDef->getValueAsListOfDefs("Variants"); for (Record *Variant : Vars) { @@ -616,7 +611,7 @@ void CodeGenSchedModels::collectSchedRW() { SchedWrites.resize(1); SchedReads.resize(1); - SmallPtrSet RWSet; + SmallPtrSet RWSet; // Find all SchedReadWrites referenced by instruction defs. RecVec SWDefs, SRDefs; @@ -673,8 +668,7 @@ void CodeGenSchedModels::collectSchedRW() { if (!AliasDef->isSubClassOf("SchedWrite")) PrintFatalError(ADef->getLoc(), "SchedWrite Alias must be SchedWrite"); scanSchedRW(AliasDef, SWDefs, RWSet); - } - else { + } else { assert(MatchDef->isSubClassOf("SchedRead") && "Unknown SchedReadWrite"); if (!AliasDef->isSubClassOf("SchedRead")) PrintFatalError(ADef->getLoc(), "SchedRead Alias must be SchedRead"); @@ -690,7 +684,7 @@ void CodeGenSchedModels::collectSchedRW() { } llvm::sort(SRDefs, LessRecord()); for (Record *SRDef : SRDefs) { - assert(!getSchedRWIdx(SRDef, /*IsRead-*/true) && "duplicate SchedWrite"); + assert(!getSchedRWIdx(SRDef, /*IsRead-*/ true) && "duplicate SchedWrite"); SchedReads.emplace_back(SchedReads.size(), SRDef); } // Initialize WriteSequence vectors. @@ -753,9 +747,9 @@ unsigned CodeGenSchedModels::getSchedRWIdx(const Record *Def, } bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const { - for (auto& ProcModel : ProcModels) { + for (auto &ProcModel : ProcModels) { const RecVec &RADefs = ProcModel.ReadAdvanceDefs; - for (auto& RADef : RADefs) { + for (auto &RADef : RADefs) { RecVec ValidWrites = RADef->getValueAsListOfDefs("ValidWrites"); if (is_contained(ValidWrites, WriteDef)) return true; @@ -764,8 +758,8 @@ bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const { return false; } -static void splitSchedReadWrites(const RecVec &RWDefs, - RecVec &WriteDefs, RecVec &ReadDefs) { +static void splitSchedReadWrites(const RecVec &RWDefs, RecVec &WriteDefs, + RecVec &ReadDefs) { for (Record *RWDef : RWDefs) { if (RWDef->isSubClassOf("SchedWrite")) WriteDefs.push_back(RWDef); @@ -777,8 +771,8 @@ static void splitSchedReadWrites(const RecVec &RWDefs, } // Split the SchedReadWrites defs and call findRWs for each list. 
-void CodeGenSchedModels::findRWs(const RecVec &RWDefs, - IdxVec &Writes, IdxVec &Reads) const { +void CodeGenSchedModels::findRWs(const RecVec &RWDefs, IdxVec &Writes, + IdxVec &Reads) const { RecVec WriteDefs; RecVec ReadDefs; splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs); @@ -803,8 +797,7 @@ void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq, RWSeq.push_back(RWIdx); return; } - int Repeat = - SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1; + int Repeat = SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1; for (int i = 0; i < Repeat; ++i) { for (unsigned I : SchedRW.Sequence) { expandRWSequence(I, RWSeq, IsRead); @@ -815,8 +808,8 @@ void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq, // Expand a SchedWrite as a sequence following any aliases that coincide with // the given processor model. void CodeGenSchedModels::expandRWSeqForProc( - unsigned RWIdx, IdxVec &RWSeq, bool IsRead, - const CodeGenProcModel &ProcModel) const { + unsigned RWIdx, IdxVec &RWSeq, bool IsRead, + const CodeGenProcModel &ProcModel) const { const CodeGenSchedRW &SchedWrite = getSchedRW(RWIdx, IsRead); Record *AliasDef = nullptr; @@ -828,14 +821,16 @@ void CodeGenSchedModels::expandRWSeqForProc( continue; } if (AliasDef) - PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases " - "defined for processor " + ProcModel.ModelName + - " Ensure only one SchedAlias exists per RW."); + PrintFatalError(AliasRW.TheDef->getLoc(), + "Multiple aliases " + "defined for processor " + + ProcModel.ModelName + + " Ensure only one SchedAlias exists per RW."); AliasDef = AliasRW.TheDef; } if (AliasDef) { - expandRWSeqForProc(getSchedRWIdx(AliasDef, IsRead), - RWSeq, IsRead,ProcModel); + expandRWSeqForProc(getSchedRWIdx(AliasDef, IsRead), RWSeq, IsRead, + ProcModel); return; } if (!SchedWrite.IsSequence) { @@ -843,7 +838,7 @@ void CodeGenSchedModels::expandRWSeqForProc( return; } int Repeat = - SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1; + SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1; for (int I = 0, E = Repeat; I < E; ++I) { for (unsigned Idx : SchedWrite.Sequence) { expandRWSeqForProc(Idx, RWSeq, IsRead, ProcModel); @@ -888,8 +883,7 @@ void CodeGenSchedModels::collectSchedClasses() { // NoItinerary is always the first class at Idx=0 assert(SchedClasses.empty() && "Expected empty sched class"); - SchedClasses.emplace_back(0, "NoInstrModel", - Records.getDef("NoItinerary")); + SchedClasses.emplace_back(0, "NoInstrModel", Records.getDef("NoItinerary")); SchedClasses.back().ProcIndices.push_back(0); // Create a SchedClass for each unique combination of itinerary class and @@ -901,7 +895,7 @@ void CodeGenSchedModels::collectSchedClasses() { findRWs(Inst->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads); // ProcIdx == 0 indicates the class applies to all processors. - unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/{0}); + unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/ {0}); InstrClassMap[Inst->TheDef] = SCIdx; } // Create classes for InstRW defs. 
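For context on the class bookkeeping in the hunks around here: collectSchedClasses() creates one scheduling class per unique combination of itinerary class and SchedRW lists, records each instruction's class index in InstrClassMap, and addSchedClass() reuses an existing class (merging its ProcIndices via std::set_union) when an equivalent key already exists. A minimal standalone sketch of that intern-or-reuse idiom; the names (SchedKey, getOrAddClass) and the deliberately simplified key types are hypothetical stand-ins, not the real TableGen structures:

#include <map>
#include <tuple>
#include <vector>

// Hypothetical, simplified stand-in for the (ItinClassDef, Writes, Reads) key.
struct SchedKey {
  int ItinClass;
  std::vector<unsigned> Writes;
  std::vector<unsigned> Reads;
  bool operator<(const SchedKey &O) const {
    return std::tie(ItinClass, Writes, Reads) <
           std::tie(O.ItinClass, O.Writes, O.Reads);
  }
};

// Return the existing dense index for Key, or append a new class for it.
unsigned getOrAddClass(std::map<SchedKey, unsigned> &Interned,
                       std::vector<SchedKey> &Classes, const SchedKey &Key) {
  auto [It, Inserted] = Interned.try_emplace(Key, Classes.size());
  if (Inserted)
    Classes.push_back(Key);
  return It->second;
}

The real code keys on Record pointers and scans SchedClasses linearly with isKeyEqual rather than keeping a side map; the sketch only illustrates the invariant that equal keys map to a single index.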
@@ -933,7 +927,8 @@ void CodeGenSchedModels::collectSchedClasses() { } CodeGenSchedClass &SC = getSchedClass(SCIdx); if (SC.ProcIndices[0] != 0) - PrintFatalError(Inst->TheDef->getLoc(), "Instruction's sched class " + PrintFatalError(Inst->TheDef->getLoc(), + "Instruction's sched class " "must not be subtarget specific."); IdxVec ProcIndices; @@ -962,8 +957,7 @@ void CodeGenSchedModels::collectSchedClasses() { << InstName); IdxVec Writes; IdxVec Reads; - findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), - Writes, Reads); + findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); LLVM_DEBUG({ for (unsigned WIdx : Writes) dbgs() << " " << SchedWrites[WIdx].Name; @@ -1032,25 +1026,23 @@ unsigned CodeGenSchedModels::addSchedClass(Record *ItinClassDef, assert(!ProcIndices.empty() && "expect at least one ProcIdx"); auto IsKeyEqual = [=](const CodeGenSchedClass &SC) { - return SC.isKeyEqual(ItinClassDef, OperWrites, OperReads); - }; + return SC.isKeyEqual(ItinClassDef, OperWrites, OperReads); + }; auto I = find_if(make_range(schedClassBegin(), schedClassEnd()), IsKeyEqual); unsigned Idx = I == schedClassEnd() ? 0 : std::distance(schedClassBegin(), I); if (Idx || SchedClasses[0].isKeyEqual(ItinClassDef, OperWrites, OperReads)) { IdxVec PI; std::set_union(SchedClasses[Idx].ProcIndices.begin(), - SchedClasses[Idx].ProcIndices.end(), - ProcIndices.begin(), ProcIndices.end(), - std::back_inserter(PI)); + SchedClasses[Idx].ProcIndices.end(), ProcIndices.begin(), + ProcIndices.end(), std::back_inserter(PI)); SchedClasses[Idx].ProcIndices = std::move(PI); return Idx; } Idx = SchedClasses.size(); - SchedClasses.emplace_back(Idx, - createSchedClassName(ItinClassDef, OperWrites, - OperReads), - ItinClassDef); + SchedClasses.emplace_back( + Idx, createSchedClassName(ItinClassDef, OperWrites, OperReads), + ItinClassDef); CodeGenSchedClass &SC = SchedClasses.back(); SC.Writes = OperWrites; SC.Reads = OperReads; @@ -1083,17 +1075,16 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { // the Instrs to it. for (auto &Entry : ClassInstrs) { unsigned OldSCIdx = Entry.first; - ArrayRef InstDefs = Entry.second; + ArrayRef InstDefs = Entry.second; // If the all instrs in the current class are accounted for, then leave // them mapped to their old class. if (OldSCIdx) { const RecVec &RWDefs = SchedClasses[OldSCIdx].InstRWs; if (!RWDefs.empty()) { const RecVec *OrigInstDefs = Sets.expand(RWDefs[0]); - unsigned OrigNumInstrs = - count_if(*OrigInstDefs, [&](Record *OIDef) { - return InstrClassMap[OIDef] == OldSCIdx; - }); + unsigned OrigNumInstrs = count_if(*OrigInstDefs, [&](Record *OIDef) { + return InstrClassMap[OIDef] == OldSCIdx; + }); if (OrigNumInstrs == InstDefs.size()) { assert(SchedClasses[OldSCIdx].ProcIndices[0] == 0 && "expected a generic SchedClass"); @@ -1148,8 +1139,7 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { "\"."); PrintFatalNote(OldRWDef->getLoc(), "Previous match was here."); } - assert(OldRWDef != InstRWDef && - "SchedClass has duplicate InstRW def"); + assert(OldRWDef != InstRWDef && "SchedClass has duplicate InstRW def"); SC.InstRWs.push_back(OldRWDef); } } @@ -1162,7 +1152,8 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { // True if collectProcItins found anything. 
bool CodeGenSchedModels::hasItineraries() const { - for (const CodeGenProcModel &PM : make_range(procModelBegin(),procModelEnd())) + for (const CodeGenProcModel &PM : + make_range(procModelBegin(), procModelEnd())) if (PM.hasItineraries()) return true; return false; @@ -1217,14 +1208,14 @@ void CodeGenSchedModels::collectProcItins() { void CodeGenSchedModels::collectProcItinRW() { RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW"); llvm::sort(ItinRWDefs, LessRecord()); - for (Record *RWDef : ItinRWDefs) { + for (Record *RWDef : ItinRWDefs) { if (!RWDef->getValueInit("SchedModel")->isComplete()) PrintFatalError(RWDef->getLoc(), "SchedModel is undefined"); Record *ModelDef = RWDef->getValueAsDef("SchedModel"); ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef); if (I == ProcModelMap.end()) { - PrintFatalError(RWDef->getLoc(), "Undefined SchedMachineModel " - + ModelDef->getName()); + PrintFatalError(RWDef->getLoc(), + "Undefined SchedMachineModel " + ModelDef->getName()); } ProcModels[I->second].ItinRWDefs.push_back(RWDef); } @@ -1254,10 +1245,10 @@ void CodeGenSchedModels::inferSchedClasses() { if (!SchedClasses[Idx].InstRWs.empty()) inferFromInstRWs(Idx); if (!SchedClasses[Idx].Writes.empty()) { - inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads, - Idx, SchedClasses[Idx].ProcIndices); + inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads, Idx, + SchedClasses[Idx].ProcIndices); } - assert(SchedClasses.size() < (NumInstrSchedClasses*6) && + assert(SchedClasses.size() < (NumInstrSchedClasses * 6) && "too many SchedVariants"); } } @@ -1274,9 +1265,9 @@ void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef, if (!llvm::is_contained(Matched, ItinClassDef)) continue; if (HasMatch) - PrintFatalError(Rec->getLoc(), "Duplicate itinerary class " - + ItinClassDef->getName() - + " in ItinResources for " + PM.ModelName); + PrintFatalError(Rec->getLoc(), + "Duplicate itinerary class " + ItinClassDef->getName() + + " in ItinResources for " + PM.ModelName); HasMatch = true; IdxVec Writes, Reads; findRWs(Rec->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); @@ -1317,8 +1308,8 @@ struct TransVariant { unsigned ProcIdx; // Processor model index or zero for any. unsigned TransVecIdx; // Index into PredTransitions::TransVec. - TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti): - VarOrSeqDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {} + TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti) + : VarOrSeqDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {} }; // Associate a predicate with the SchedReadWrite that it guards. @@ -1328,15 +1319,16 @@ struct PredCheck { unsigned RWIdx; Record *Predicate; - PredCheck(bool r, unsigned w, Record *p): IsRead(r), RWIdx(w), Predicate(p) {} + PredCheck(bool r, unsigned w, Record *p) + : IsRead(r), RWIdx(w), Predicate(p) {} }; // A Predicate transition is a list of RW sequences guarded by a PredTerm. struct PredTransition { // A predicate term is a conjunction of PredChecks. 
SmallVector PredTerm; - SmallVector, 16> WriteSequences; - SmallVector, 16> ReadSequences; + SmallVector, 16> WriteSequences; + SmallVector, 16> ReadSequences; unsigned ProcIndex = 0; PredTransition() = default; @@ -1354,7 +1346,7 @@ class PredTransitions { public: std::vector TransVec; - PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {} + PredTransitions(CodeGenSchedModels &sm) : SchedModels(sm) {} bool substituteVariantOperand(const SmallVectorImpl &RWSeq, bool IsRead, unsigned StartIdx); @@ -1368,9 +1360,8 @@ public: private: bool mutuallyExclusive(Record *PredDef, ArrayRef Preds, ArrayRef Term); - void getIntersectingVariants( - const CodeGenSchedRW &SchedRW, unsigned TransIdx, - std::vector &IntersectingVariants); + void getIntersectingVariants(const CodeGenSchedRW &SchedRW, unsigned TransIdx, + std::vector &IntersectingVariants); void pushVariant(const TransVariant &VInfo, bool IsRead); }; @@ -1388,7 +1379,7 @@ private: bool PredTransitions::mutuallyExclusive(Record *PredDef, ArrayRef Preds, ArrayRef Term) { - for (const PredCheck &PC: Term) { + for (const PredCheck &PC : Term) { if (PC.Predicate == PredDef) return false; @@ -1446,8 +1437,8 @@ static std::vector getAllPredicates(ArrayRef Variants, // given SchedRW whose processor indices and predicates are not mutually // exclusive with the given transition. void PredTransitions::getIntersectingVariants( - const CodeGenSchedRW &SchedRW, unsigned TransIdx, - std::vector &IntersectingVariants) { + const CodeGenSchedRW &SchedRW, unsigned TransIdx, + std::vector &IntersectingVariants) { bool GenericRW = false; @@ -1489,7 +1480,7 @@ void PredTransitions::getIntersectingVariants( } const CodeGenSchedRW &AliasRW = - SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW")); + SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW")); if (AliasRW.HasVariants) { const RecVec VarDefs = AliasRW.TheDef->getValueAsListOfDefs("Variants"); @@ -1516,8 +1507,7 @@ void PredTransitions::getIntersectingVariants( // The first variant builds on the existing transition. Variant.TransVecIdx = TransIdx; IntersectingVariants.push_back(Variant); - } - else { + } else { // Push another copy of the current transition for more variants. Variant.TransVecIdx = TransVec.size(); IntersectingVariants.push_back(Variant); @@ -1525,15 +1515,15 @@ void PredTransitions::getIntersectingVariants( } } if (GenericRW && IntersectingVariants.empty()) { - PrintFatalError(SchedRW.TheDef->getLoc(), "No variant of this type has " + PrintFatalError(SchedRW.TheDef->getLoc(), + "No variant of this type has " "a matching predicate on any processor"); } } // Push the Reads/Writes selected by this variant onto the PredTransition // specified by VInfo. 
-void PredTransitions:: -pushVariant(const TransVariant &VInfo, bool IsRead) { +void PredTransitions::pushVariant(const TransVariant &VInfo, bool IsRead) { PredTransition &Trans = TransVec[VInfo.TransVecIdx]; // If this operand transition is reached through a processor-specific alias, @@ -1541,11 +1531,10 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { IdxVec SelectedRWs; if (VInfo.VarOrSeqDef->isSubClassOf("SchedVar")) { Record *PredDef = VInfo.VarOrSeqDef->getValueAsDef("Predicate"); - Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx,PredDef); + Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx, PredDef); RecVec SelectedDefs = VInfo.VarOrSeqDef->getValueAsListOfDefs("Selected"); SchedModels.findRWs(SelectedDefs, SelectedRWs, IsRead); - } - else { + } else { assert(VInfo.VarOrSeqDef->isSubClassOf("WriteSequence") && "variant must be a SchedVariant or aliased WriteSequence"); SelectedRWs.push_back(SchedModels.getSchedRWIdx(VInfo.VarOrSeqDef, IsRead)); @@ -1553,10 +1542,10 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(VInfo.RWIdx, IsRead); - SmallVectorImpl> &RWSequences = IsRead - ? Trans.ReadSequences : Trans.WriteSequences; + SmallVectorImpl> &RWSequences = + IsRead ? Trans.ReadSequences : Trans.WriteSequences; if (SchedRW.IsVariadic) { - unsigned OperIdx = RWSequences.size()-1; + unsigned OperIdx = RWSequences.size() - 1; // Make N-1 copies of this transition's last sequence. RWSequences.reserve(RWSequences.size() + SelectedRWs.size() - 1); RWSequences.insert(RWSequences.end(), SelectedRWs.size() - 1, @@ -1565,8 +1554,8 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { // sequence (split the current operand into N operands). // Note that write sequences should be expanded within this loop--the entire // sequence belongs to a single operand. - for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end(); - RWI != RWE; ++RWI, ++OperIdx) { + for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end(); RWI != RWE; + ++RWI, ++OperIdx) { IdxVec ExpandedRWs; if (IsRead) ExpandedRWs.push_back(*RWI); @@ -1575,8 +1564,7 @@ pushVariant(const TransVariant &VInfo, bool IsRead) { llvm::append_range(RWSequences[OperIdx], ExpandedRWs); } assert(OperIdx == RWSequences.size() && "missed a sequence"); - } - else { + } else { // Push this transition's expanded sequence onto this transition's last // sequence (add to the current operand's sequence). SmallVectorImpl &Seq = RWSequences.back(); @@ -1644,8 +1632,9 @@ bool PredTransitions::substituteVariants(const PredTransition &Trans) { // Visit each original write sequence. for (const auto &WriteSequence : Trans.WriteSequences) { // Push a new (empty) write sequence onto all partial Transitions. - for (std::vector::iterator I = - TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { + for (std::vector::iterator I = TransVec.begin() + StartIdx, + E = TransVec.end(); + I != E; ++I) { I->WriteSequences.emplace_back(); } Subst |= @@ -1654,8 +1643,9 @@ bool PredTransitions::substituteVariants(const PredTransition &Trans) { // Visit each original read sequence. for (const auto &ReadSequence : Trans.ReadSequences) { // Push a new (empty) read sequence onto all partial Transitions. 
- for (std::vector::iterator I = - TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { + for (std::vector::iterator I = TransVec.begin() + StartIdx, + E = TransVec.end(); + I != E; ++I) { I->ReadSequences.emplace_back(); } Subst |= substituteVariantOperand(ReadSequence, /*IsRead=*/true, StartIdx); @@ -1814,7 +1804,7 @@ bool CodeGenSchedModels::hasSuperGroup(RecVec &SubUnits, CodeGenProcModel &PM) { continue; RecVec SuperUnits = ProcResourceDef->getValueAsListOfDefs("Resources"); RecIter RI = SubUnits.begin(), RE = SubUnits.end(); - for ( ; RI != RE; ++RI) { + for (; RI != RE; ++RI) { if (!is_contained(SuperUnits, *RI)) { break; } @@ -1831,22 +1821,22 @@ void CodeGenSchedModels::verifyProcResourceGroups(CodeGenProcModel &PM) { if (!PM.ProcResourceDefs[i]->isSubClassOf("ProcResGroup")) continue; RecVec CheckUnits = - PM.ProcResourceDefs[i]->getValueAsListOfDefs("Resources"); - for (unsigned j = i+1; j < e; ++j) { + PM.ProcResourceDefs[i]->getValueAsListOfDefs("Resources"); + for (unsigned j = i + 1; j < e; ++j) { if (!PM.ProcResourceDefs[j]->isSubClassOf("ProcResGroup")) continue; RecVec OtherUnits = - PM.ProcResourceDefs[j]->getValueAsListOfDefs("Resources"); + PM.ProcResourceDefs[j]->getValueAsListOfDefs("Resources"); if (std::find_first_of(CheckUnits.begin(), CheckUnits.end(), - OtherUnits.begin(), OtherUnits.end()) - != CheckUnits.end()) { + OtherUnits.begin(), + OtherUnits.end()) != CheckUnits.end()) { // CheckUnits and OtherUnits overlap llvm::append_range(OtherUnits, CheckUnits); if (!hasSuperGroup(OtherUnits, PM)) { PrintFatalError((PM.ProcResourceDefs[i])->getLoc(), - "proc resource group overlaps with " - + PM.ProcResourceDefs[j]->getName() - + " but no supergroup contains both."); + "proc resource group overlaps with " + + PM.ProcResourceDefs[j]->getName() + + " but no supergroup contains both."); } } } @@ -1862,7 +1852,7 @@ void CodeGenSchedModels::collectRegisterFiles() { // For each register file definition, construct a CodeGenRegisterFile object // and add it to the appropriate scheduling model. 
CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel")); - PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF)); + PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(), RF)); CodeGenRegisterFile &CGRF = PM.RegisterFiles.back(); CGRF.MaxMovesEliminatedPerCycle = RF->getValueAsInt("MaxMovesEliminatedPerCycle"); @@ -2013,7 +2003,7 @@ void CodeGenSchedModels::checkCompleteness() { PrintError(Inst->TheDef->getLoc(), "No schedule information for instruction '" + Inst->TheDef->getName() + "' in SchedMachineModel '" + - ProcModel.ModelDef->getName() + "'"); + ProcModel.ModelDef->getName() + "'"); Complete = false; } continue; @@ -2039,14 +2029,18 @@ } } if (!Complete) { - errs() << "\n\nIncomplete schedule models found.\n" - << "- Consider setting 'CompleteModel = 0' while developing new models.\n" - << "- Pseudo instructions can be marked with 'hasNoSchedulingInfo = 1'.\n" - << "- Instructions should usually have Sched<[...]> as a superclass, " "you may temporarily use an empty list.\n" - << "- Instructions related to unsupported features can be excluded with " "list<Predicate> UnsupportedFeatures = [HasA,..,HasY]; in the " "processor model.\n\n"; + errs() + << "\n\nIncomplete schedule models found.\n" + << "- Consider setting 'CompleteModel = 0' while developing new " + "models.\n" + << "- Pseudo instructions can be marked with 'hasNoSchedulingInfo = " + "1'.\n" + << "- Instructions should usually have Sched<[...]> as a superclass, " "you may temporarily use an empty list.\n" + << "- Instructions related to unsupported features can be excluded " "with " "list<Predicate> UnsupportedFeatures = [HasA,..,HasY]; in the " "processor model.\n\n"; PrintFatalError("Incomplete schedule model"); } } @@ -2057,15 +2051,15 @@ void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) { const CodeGenProcModel &PM = ProcModels[PIdx]; // For all ItinRW entries.
bool HasMatch = false; - for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end(); - II != IE; ++II) { + for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end(); II != IE; + ++II) { RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses"); if (!llvm::is_contained(Matched, ItinClassDef)) continue; if (HasMatch) - PrintFatalError((*II)->getLoc(), "Duplicate itinerary class " - + ItinClassDef->getName() - + " in ItinResources for " + PM.ModelName); + PrintFatalError((*II)->getLoc(), + "Duplicate itinerary class " + ItinClassDef->getName() + + " in ItinResources for " + PM.ModelName); HasMatch = true; IdxVec Writes, Reads; findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); @@ -2081,8 +2075,7 @@ void CodeGenSchedModels::collectRWResources(unsigned RWIdx, bool IsRead, if (!IsRead && SchedRW.TheDef->isSubClassOf("SchedWriteRes")) { for (unsigned Idx : ProcIndices) addWriteRes(SchedRW.TheDef, Idx); - } - else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) { + } else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) { for (unsigned Idx : ProcIndices) addReadAdvance(SchedRW.TheDef, Idx); } @@ -2128,31 +2121,30 @@ Record *CodeGenSchedModels::findProcResUnits(Record *ProcResKind, assert(!ProcResGroups.empty()); for (Record *ProcResDef : ProcResourceDefs) { - if (ProcResDef->getValueAsDef("Kind") == ProcResKind - && ProcResDef->getValueAsDef("SchedModel") == PM.ModelDef) { + if (ProcResDef->getValueAsDef("Kind") == ProcResKind && + ProcResDef->getValueAsDef("SchedModel") == PM.ModelDef) { if (ProcUnitDef) { PrintFatalError(Loc, - "Multiple ProcessorResourceUnits associated with " - + ProcResKind->getName()); + "Multiple ProcessorResourceUnits associated with " + + ProcResKind->getName()); } ProcUnitDef = ProcResDef; } } for (Record *ProcResGroup : ProcResGroups) { - if (ProcResGroup == ProcResKind - && ProcResGroup->getValueAsDef("SchedModel") == PM.ModelDef) { + if (ProcResGroup == ProcResKind && + ProcResGroup->getValueAsDef("SchedModel") == PM.ModelDef) { if (ProcUnitDef) { PrintFatalError(Loc, - "Multiple ProcessorResourceUnits associated with " - + ProcResKind->getName()); + "Multiple ProcessorResourceUnits associated with " + + ProcResKind->getName()); } ProcUnitDef = ProcResGroup; } } if (!ProcUnitDef) { - PrintFatalError(Loc, - "No ProcessorResources associated with " - + ProcResKind->getName()); + PrintFatalError(Loc, "No ProcessorResources associated with " + + ProcResKind->getName()); } return ProcUnitDef; } @@ -2208,14 +2200,16 @@ unsigned CodeGenProcModel::getProcResourceIdx(Record *PRDef) const { RecIter PRPos = find(ProcResourceDefs, PRDef); if (PRPos == ProcResourceDefs.end()) PrintFatalError(PRDef->getLoc(), "ProcResource def is not included in " - "the ProcResources list for " + ModelName); + "the ProcResources list for " + + ModelName); // Idx=0 is reserved for invalid. 
return 1 + (PRPos - ProcResourceDefs.begin()); } bool CodeGenProcModel::isUnsupported(const CodeGenInstruction &Inst) const { for (const Record *TheDef : UnsupportedFeaturesDefs) { - for (const Record *PredDef : Inst.TheDef->getValueAsListOfDefs("Predicates")) { + for (const Record *PredDef : + Inst.TheDef->getValueAsListOfDefs("Predicates")) { if (TheDef->getName() == PredDef->getName()) return true; } @@ -2239,12 +2233,11 @@ void CodeGenSchedRW::dump() const { } } -void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const { - dbgs() << "SCHEDCLASS " << Index << ":" << Name << '\n' - << " Writes: "; +void CodeGenSchedClass::dump(const CodeGenSchedModels *SchedModels) const { + dbgs() << "SCHEDCLASS " << Index << ":" << Name << '\n' << " Writes: "; for (unsigned i = 0, N = Writes.size(); i < N; ++i) { SchedModels->getSchedWrite(Writes[i]).dump(); - if (i < N-1) { + if (i < N - 1) { dbgs() << '\n'; dbgs().indent(10); } @@ -2252,12 +2245,13 @@ void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const { dbgs() << "\n Reads: "; for (unsigned i = 0, N = Reads.size(); i < N; ++i) { SchedModels->getSchedRead(Reads[i]).dump(); - if (i < N-1) { + if (i < N - 1) { dbgs() << '\n'; dbgs().indent(10); } } - dbgs() << "\n ProcIdx: "; dumpIdxVec(ProcIndices); + dbgs() << "\n ProcIdx: "; + dumpIdxVec(ProcIndices); if (!Transitions.empty()) { dbgs() << "\n Transitions for Proc "; for (const CodeGenSchedTransition &Transition : Transitions) { diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h index 76ef1e4..61980e7 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.h +++ b/llvm/utils/TableGen/CodeGenSchedule.h @@ -33,8 +33,8 @@ class CodeGenTarget; class CodeGenSchedModels; class CodeGenInstruction; -using RecVec = std::vector; -using RecIter = std::vector::const_iterator; +using RecVec = std::vector; +using RecIter = std::vector::const_iterator; using IdxVec = std::vector; using IdxIter = std::vector::const_iterator; @@ -59,10 +59,10 @@ struct CodeGenSchedRW { RecVec Aliases; CodeGenSchedRW() - : Index(0), TheDef(nullptr), IsRead(false), IsAlias(false), - HasVariants(false), IsVariadic(false), IsSequence(false) {} + : Index(0), TheDef(nullptr), IsRead(false), IsAlias(false), + HasVariants(false), IsVariadic(false), IsSequence(false) {} CodeGenSchedRW(unsigned Idx, Record *Def) - : Index(Idx), TheDef(Def), IsAlias(false), IsVariadic(false) { + : Index(Idx), TheDef(Def), IsAlias(false), IsVariadic(false) { Name = std::string(Def->getName()); IsRead = Def->isSubClassOf("SchedRead"); HasVariants = Def->isSubClassOf("SchedVariant"); @@ -148,7 +148,7 @@ struct CodeGenSchedClass { DenseSet InstRWProcIndices; CodeGenSchedClass(unsigned Index, std::string Name, Record *ItinClassDef) - : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {} + : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {} bool isKeyEqual(Record *IC, ArrayRef W, ArrayRef R) const { @@ -173,7 +173,8 @@ struct CodeGenRegisterCost { Record *RCDef; unsigned Cost; bool AllowMoveElimination; - CodeGenRegisterCost(Record *RC, unsigned RegisterCost, bool AllowMoveElim = false) + CodeGenRegisterCost(Record *RC, unsigned RegisterCost, + bool AllowMoveElim = false) : RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {} CodeGenRegisterCost(const CodeGenRegisterCost &) = default; CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete; @@ -193,12 +194,12 @@ struct CodeGenRegisterFile { unsigned NumPhysRegs; std::vector Costs; - 
CodeGenRegisterFile(StringRef name, Record *def, unsigned MaxMoveElimPerCy = 0, + CodeGenRegisterFile(StringRef name, Record *def, + unsigned MaxMoveElimPerCy = 0, bool AllowZeroMoveElimOnly = false) : Name(name), RegisterFileDef(def), MaxMovesEliminatedPerCycle(MaxMoveElimPerCy), - AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), - NumPhysRegs(0) {} + AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), NumPhysRegs(0) {} bool hasDefaultCosts() const { return Costs.empty(); } }; @@ -255,10 +256,9 @@ struct CodeGenProcModel { Record *LoadQueue; Record *StoreQueue; - CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef, - Record *IDef) : - Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef), - RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {} + CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef, Record *IDef) + : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef), + RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {} bool hasItineraries() const { return !ItinsDef->getValueAsListOfDefs("IID").empty(); @@ -443,14 +443,14 @@ class CodeGenSchedModels { // Map each instruction to its unique SchedClass index considering the // combination of it's itinerary class, SchedRW list, and InstRW records. - using InstClassMapTy = DenseMap; + using InstClassMapTy = DenseMap; InstClassMapTy InstrClassMap; std::vector STIPredicates; std::vector getAllProcIndices() const; public: - CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT); + CodeGenSchedModels(RecordKeeper &RK, const CodeGenTarget &TGT); // iterator access to the scheduling classes. using class_iterator = std::vector::iterator; @@ -460,10 +460,10 @@ public: class_iterator classes_end() { return SchedClasses.end(); } const_class_iterator classes_end() const { return SchedClasses.end(); } iterator_range classes() { - return make_range(classes_begin(), classes_end()); + return make_range(classes_begin(), classes_end()); } iterator_range classes() const { - return make_range(classes_begin(), classes_end()); + return make_range(classes_begin(), classes_end()); } iterator_range explicit_classes() { return make_range(classes_begin(), classes_begin() + NumInstrSchedClasses); @@ -476,8 +476,8 @@ public: Record *ModelDef = ProcDef->getValueAsDef("SchedModel"); Record *ItinsDef = ProcDef->getValueAsDef("ProcItin"); if (!ItinsDef->getValueAsListOfDefs("IID").empty()) { - assert(ModelDef->getValueAsBit("NoModel") - && "Itineraries must be defined within SchedMachineModel"); + assert(ModelDef->getValueAsBit("NoModel") && + "Itineraries must be defined within SchedMachineModel"); return ItinsDef; } return ModelDef; @@ -496,7 +496,7 @@ public: return ProcModels[I->second]; } const CodeGenProcModel &getProcModel(Record *ModelDef) const { - return const_cast(this)->getProcModel(ModelDef); + return const_cast(this)->getProcModel(ModelDef); } // Iterate over the unique processor models. @@ -527,11 +527,11 @@ public: CodeGenSchedRW &getSchedRW(Record *Def) { bool IsRead = Def->isSubClassOf("SchedRead"); unsigned Idx = getSchedRWIdx(Def, IsRead); - return const_cast( - IsRead ? getSchedRead(Idx) : getSchedWrite(Idx)); + return const_cast(IsRead ? 
getSchedRead(Idx) + : getSchedWrite(Idx)); } const CodeGenSchedRW &getSchedRW(Record *Def) const { - return const_cast<CodeGenSchedModels&>(*this).getSchedRW(Def); + return const_cast<CodeGenSchedModels &>(*this).getSchedRW(Def); } unsigned getSchedRWIdx(const Record *Def, bool IsRead) const; @@ -579,6 +579,7 @@ ArrayRef<STIPredicateFunction> getSTIPredicates() const { return STIPredicates; } + private: void collectProcModels(); diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index ceaa51b1..8e2957e 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -49,10 +49,14 @@ MVT::SimpleValueType llvm::getValueType(const Record *Rec) { StringRef llvm::getName(MVT::SimpleValueType T) { switch (T) { - case MVT::Other: return "UNKNOWN"; - case MVT::iPTR: return "TLI.getPointerTy()"; - case MVT::iPTRAny: return "TLI.getPointerTy()"; - default: return getEnumName(T); + case MVT::Other: + return "UNKNOWN"; + case MVT::iPTR: + return "TLI.getPointerTy()"; + case MVT::iPTRAny: + return "TLI.getPointerTy()"; + default: + return getEnumName(T); } } @@ -280,12 +284,11 @@ std::string llvm::getQualifiedName(const Record *R) { return Namespace + "::" + R->getName().str(); } - /// getTarget - Return the current instance of the Target class. /// CodeGenTarget::CodeGenTarget(RecordKeeper &records) - : Records(records), CGH(records) { - std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target"); + : Records(records), CGH(records) { + std::vector<Record *> Targets = Records.getAllDerivedDefinitions("Target"); if (Targets.size() == 0) PrintFatalError("No 'Target' subclasses defined!"); if (Targets.size() != 1) @@ -294,8 +297,7 @@ MacroFusions = Records.getAllDerivedDefinitions("Fusion"); } -CodeGenTarget::~CodeGenTarget() { -} +CodeGenTarget::~CodeGenTarget() {} StringRef CodeGenTarget::getName() const { return TargetRec->getName(); } @@ -331,7 +333,7 @@ bool CodeGenTarget::getAllowRegisterRenaming() const { /// getAsmParser - Return the AssemblyParser definition for this target. /// Record *CodeGenTarget::getAsmParser() const { - std::vector<Record*> LI = TargetRec->getValueAsListOfDefs("AssemblyParsers"); + std::vector<Record *> LI = TargetRec->getValueAsListOfDefs("AssemblyParsers"); if (AsmParserNum >= LI.size()) PrintFatalError("Target does not have an AsmParser #" + Twine(AsmParserNum) + "!"); @@ -342,8 +344,8 @@ /// this target. /// Record *CodeGenTarget::getAsmParserVariant(unsigned i) const { - std::vector<Record*> LI = - TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); + std::vector<Record *> LI = + TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); if (i >= LI.size()) PrintFatalError("Target does not have an AsmParserVariant #" + Twine(i) + "!"); @@ -354,15 +356,15 @@ /// available for this target. /// unsigned CodeGenTarget::getAsmParserVariantCount() const { - std::vector<Record*> LI = - TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); + std::vector<Record *> LI = + TargetRec->getValueAsListOfDefs("AssemblyParserVariants"); return LI.size(); } /// getAsmWriter - Return the AssemblyWriter definition for this target.
/// Record *CodeGenTarget::getAsmWriter() const { - std::vector LI = TargetRec->getValueAsListOfDefs("AssemblyWriters"); + std::vector LI = TargetRec->getValueAsListOfDefs("AssemblyWriters"); if (AsmWriterNum >= LI.size()) PrintFatalError("Target does not have an AsmWriter #" + Twine(AsmWriterNum) + "!"); @@ -437,8 +439,7 @@ const CodeGenRegisterClass &CodeGenTarget::getRegisterClass(Record *R) const { return *getRegBank().getRegClass(R); } -std::vector CodeGenTarget::getRegisterVTs(Record *R) - const { +std::vector CodeGenTarget::getRegisterVTs(Record *R) const { const CodeGenRegister *Reg = getRegBank().getReg(R); std::vector Result; for (const auto &RC : getRegBank().getRegClasses()) { @@ -454,16 +455,15 @@ std::vector CodeGenTarget::getRegisterVTs(Record *R) return Result; } - void CodeGenTarget::ReadLegalValueTypes() const { for (const auto &RC : getRegBank().getRegClasses()) llvm::append_range(LegalValueTypes, RC.VTs); // Remove duplicates. llvm::sort(LegalValueTypes); - LegalValueTypes.erase(std::unique(LegalValueTypes.begin(), - LegalValueTypes.end()), - LegalValueTypes.end()); + LegalValueTypes.erase( + std::unique(LegalValueTypes.begin(), LegalValueTypes.end()), + LegalValueTypes.end()); } CodeGenSchedModels &CodeGenTarget::getSchedModels() const { @@ -473,7 +473,7 @@ CodeGenSchedModels &CodeGenTarget::getSchedModels() const { } void CodeGenTarget::ReadInstructions() const { - std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); + std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); if (Insts.size() <= 2) PrintFatalError("No 'Instruction' subclasses defined!"); @@ -482,11 +482,10 @@ void CodeGenTarget::ReadInstructions() const { Instructions[Insts[i]] = std::make_unique(Insts[i]); } -static const CodeGenInstruction * -GetInstByName(const char *Name, - const DenseMap> &Insts, - RecordKeeper &Records) { +static const CodeGenInstruction *GetInstByName( + const char *Name, + const DenseMap> &Insts, + RecordKeeper &Records) { const Record *Rec = Records.getDef(Name); const auto I = Insts.find(Rec); @@ -545,7 +544,6 @@ void CodeGenTarget::ComputeInstrsByEnum() const { Inst->EnumVal = Num++; } - /// isLittleEndianEncoding - Return whether this target encodes its instruction /// in little-endian format, i.e. bits laid out in the order [0..n] /// @@ -576,7 +574,7 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() { unsigned bitSwapIdx = numBits - bit - 1; Init *OrigBit = BI->getBit(bit); Init *BitSwap = BI->getBit(bitSwapIdx); - NewBits[bit] = BitSwap; + NewBits[bit] = BitSwap; NewBits[bitSwapIdx] = OrigBit; } if (numBits % 2) { @@ -605,10 +603,10 @@ bool CodeGenTarget::guessInstructionProperties() const { // ComplexPattern implementation // ComplexPattern::ComplexPattern(Record *R) { - Ty = R->getValueAsDef("Ty"); + Ty = R->getValueAsDef("Ty"); NumOperands = R->getValueAsInt("NumOperands"); SelectFunc = std::string(R->getValueAsString("SelectFunc")); - RootNodes = R->getValueAsListOfDefs("RootNodes"); + RootNodes = R->getValueAsListOfDefs("RootNodes"); // FIXME: This is a hack to statically increase the priority of patterns which // maps a sub-dag to a complex pattern. e.g. favors LEA over ADD. To get best @@ -623,7 +621,7 @@ ComplexPattern::ComplexPattern(Record *R) { // FIXME: Why is this different from parseSDPatternOperatorProperties? // Parse the properties. 
Properties = 0; - std::vector PropList = R->getValueAsListOfDefs("Properties"); + std::vector PropList = R->getValueAsListOfDefs("Properties"); for (unsigned i = 0, e = PropList.size(); i != e; ++i) if (PropList[i]->getName() == "SDNPHasChain") { Properties |= 1 << SDNPHasChain; diff --git a/llvm/utils/TableGen/CodeGenTarget.h b/llvm/utils/TableGen/CodeGenTarget.h index 29f1024..2ae3a3a 100644 --- a/llvm/utils/TableGen/CodeGenTarget.h +++ b/llvm/utils/TableGen/CodeGenTarget.h @@ -58,10 +58,10 @@ class CodeGenTarget { RecordKeeper &Records; Record *TargetRec; - mutable DenseMap> Instructions; + mutable DenseMap> + Instructions; mutable std::unique_ptr RegBank; - mutable std::vector RegAltNameIndices; + mutable std::vector RegAltNameIndices; mutable SmallVector LegalValueTypes; CodeGenHwModes CGH; std::vector MacroFusions; @@ -75,6 +75,7 @@ class CodeGenTarget { mutable StringRef InstNamespace; mutable std::vector InstrsByEnum; mutable unsigned NumPseudoInstructions = 0; + public: CodeGenTarget(RecordKeeper &Records); ~CodeGenTarget(); @@ -130,8 +131,9 @@ public: /// return it. const CodeGenRegister *getRegisterByName(StringRef Name) const; - const std::vector &getRegAltNameIndices() const { - if (RegAltNameIndices.empty()) ReadRegAltNameIndices(); + const std::vector &getRegAltNameIndices() const { + if (RegAltNameIndices.empty()) + ReadRegAltNameIndices(); return RegAltNameIndices; } @@ -156,15 +158,17 @@ public: const std::vector getMacroFusions() const { return MacroFusions; } private: - DenseMap> & + DenseMap> & getInstructions() const { - if (Instructions.empty()) ReadInstructions(); + if (Instructions.empty()) + ReadInstructions(); return Instructions; } -public: +public: CodeGenInstruction &getInstruction(const Record *InstRec) const { - if (Instructions.empty()) ReadInstructions(); + if (Instructions.empty()) + ReadInstructions(); auto I = Instructions.find(InstRec); assert(I != Instructions.end() && "Not an instruction"); return *I->second; @@ -200,10 +204,11 @@ public: } typedef ArrayRef::const_iterator inst_iterator; - inst_iterator inst_begin() const{return getInstructionsByEnumValue().begin();} + inst_iterator inst_begin() const { + return getInstructionsByEnumValue().begin(); + } inst_iterator inst_end() const { return getInstructionsByEnumValue().end(); } - /// isLittleEndianEncoding - are instruction bit patterns defined as [0..n]? /// bool isLittleEndianEncoding() const; @@ -226,22 +231,21 @@ class ComplexPattern { Record *Ty; unsigned NumOperands; std::string SelectFunc; - std::vector RootNodes; + std::vector RootNodes; unsigned Properties; // Node properties unsigned Complexity; + public: ComplexPattern(Record *R); Record *getValueType() const { return Ty; } unsigned getNumOperands() const { return NumOperands; } const std::string &getSelectFunc() const { return SelectFunc; } - const std::vector &getRootNodes() const { - return RootNodes; - } + const std::vector &getRootNodes() const { return RootNodes; } bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); } unsigned getComplexity() const { return Complexity; } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index eaf7f7f..32b2746 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -27,6 +27,7 @@ namespace { class DAGISelEmitter { RecordKeeper &Records; // Just so we can get at the timing functions. 
CodeGenDAGPatterns CGP; + public: explicit DAGISelEmitter(RecordKeeper &R) : Records(R), CGP(R) {} void run(raw_ostream &OS); @@ -42,7 +43,8 @@ public: /// latencies in this calculation. static unsigned getResultPatternCost(TreePatternNode *P, CodeGenDAGPatterns &CGP) { - if (P->isLeaf()) return 0; + if (P->isLeaf()) + return 0; unsigned Cost = 0; Record *Op = P->getOperator(); @@ -61,7 +63,8 @@ static unsigned getResultPatternCost(TreePatternNode *P, /// pattern. static unsigned getResultPatternSize(TreePatternNode *P, CodeGenDAGPatterns &CGP) { - if (P->isLeaf()) return 0; + if (P->isLeaf()) + return 0; unsigned Cost = 0; Record *Op = P->getOperator(); @@ -98,19 +101,25 @@ struct PatternSortingPredicate { // input over nodes that cover fewer. int LHSSize = LHS->getPatternComplexity(CGP); int RHSSize = RHS->getPatternComplexity(CGP); - if (LHSSize > RHSSize) return true; // LHS -> bigger -> less cost - if (LHSSize < RHSSize) return false; + if (LHSSize > RHSSize) + return true; // LHS -> bigger -> less cost + if (LHSSize < RHSSize) + return false; // If the patterns have equal complexity, compare generated instruction cost unsigned LHSCost = getResultPatternCost(LHS->getDstPattern(), CGP); unsigned RHSCost = getResultPatternCost(RHS->getDstPattern(), CGP); - if (LHSCost < RHSCost) return true; - if (LHSCost > RHSCost) return false; + if (LHSCost < RHSCost) + return true; + if (LHSCost > RHSCost) + return false; unsigned LHSPatSize = getResultPatternSize(LHS->getDstPattern(), CGP); unsigned RHSPatSize = getResultPatternSize(RHS->getDstPattern(), CGP); - if (LHSPatSize < RHSPatSize) return true; - if (LHSPatSize > RHSPatSize) return false; + if (LHSPatSize < RHSPatSize) + return true; + if (LHSPatSize > RHSPatSize) + return false; // Sort based on the UID of the pattern, to reflect source order. // Note that this is not guaranteed to be unique, since a single source @@ -122,11 +131,11 @@ struct PatternSortingPredicate { }; } // End anonymous namespace - void DAGISelEmitter::run(raw_ostream &OS) { Records.startTimer("Parse patterns"); emitSourceFileHeader("DAG Instruction Selector for the " + - CGP.getTargetInfo().getName().str() + " target", OS); + CGP.getTargetInfo().getName().str() + " target", + OS); OS << "// *** NOTE: This file is #included into the middle of the target\n" << "// *** instruction selector class. These functions are really " @@ -155,7 +164,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { // Add all the patterns to a temporary list so we can sort them. 
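
PatternSortingPredicate above is a strict weak ordering built from a cascade of tie-breakers: higher pattern complexity first, then cheaper generated code, then smaller generated code, then source order. A compact sketch of the same comparator shape, with a hypothetical Pat struct in place of PatternToMatch:

  // Cascading tie-breakers, as in PatternSortingPredicate.
  struct Pat {
    int Complexity, Cost, Size;
    unsigned ID;
  };

  bool lessCost(const Pat &L, const Pat &R) {
    if (L.Complexity != R.Complexity)
      return L.Complexity > R.Complexity; // bigger pattern -> tried first
    if (L.Cost != R.Cost)
      return L.Cost < R.Cost;             // cheaper result -> preferred
    if (L.Size != R.Size)
      return L.Size < R.Size;             // smaller result -> preferred
    return L.ID < R.ID;                   // fallback: source order
  }
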
Records.startTimer("Sort patterns"); - std::vector Patterns; + std::vector Patterns; for (const PatternToMatch &PTM : CGP.ptms()) Patterns.push_back(&PTM); @@ -167,7 +176,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { Records.startTimer("Convert to matchers"); SmallVector PatternMatchers; for (const PatternToMatch *PTM : Patterns) { - for (unsigned Variant = 0; ; ++Variant) { + for (unsigned Variant = 0;; ++Variant) { if (Matcher *M = ConvertPatternToMatcher(*PTM, Variant, CGP)) PatternMatchers.push_back(M); else @@ -181,7 +190,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { Records.startTimer("Optimize matchers"); OptimizeMatcher(TheMatcher, CGP); - //Matcher->dump(); + // Matcher->dump(); Records.startTimer("Emit matcher table"); EmitMatcherTable(TheMatcher.get(), CGP, OS); diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp index 1a5c728..5461481 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.cpp +++ b/llvm/utils/TableGen/DAGISelMatcher.cpp @@ -15,11 +15,9 @@ #include "llvm/TableGen/Record.h" using namespace llvm; -void Matcher::anchor() { } +void Matcher::anchor() {} -void Matcher::dump() const { - print(errs(), 0); -} +void Matcher::dump() const { print(errs(), 0); } void Matcher::print(raw_ostream &OS, unsigned indent) const { printImpl(OS, indent); @@ -27,9 +25,7 @@ void Matcher::print(raw_ostream &OS, unsigned indent) const { return Next->print(OS, indent); } -void Matcher::printOne(raw_ostream &OS) const { - printImpl(OS, 0); -} +void Matcher::printOne(raw_ostream &OS) const { printImpl(OS, 0); } /// unlinkNode - Unlink the specified node from this chain. If Other == this, /// we unlink the next pointer and return it. Otherwise we unlink Other from @@ -43,7 +39,8 @@ Matcher *Matcher::unlinkNode(Matcher *Other) { for (; Cur && Cur->getNext() != Other; Cur = Cur->getNext()) /*empty*/; - if (!Cur) return nullptr; + if (!Cur) + return nullptr; Cur->takeNext(); Cur->setNext(Other->takeNext()); return this; @@ -55,7 +52,8 @@ Matcher *Matcher::unlinkNode(Matcher *Other) { bool Matcher::canMoveBefore(const Matcher *Other) const { for (;; Other = Other->getNext()) { assert(Other && "Other didn't come before 'this'?"); - if (this == Other) return true; + if (this == Other) + return true; // We have to be able to move this node across the Other node. if (!canMoveBeforeNode(Other)) @@ -78,7 +76,6 @@ bool Matcher::canMoveBeforeNode(const Matcher *Other) const { return false; } - ScopeMatcher::~ScopeMatcher() { for (Matcher *C : Children) delete C; @@ -96,8 +93,8 @@ SwitchTypeMatcher::~SwitchTypeMatcher() { CheckPredicateMatcher::CheckPredicateMatcher( const TreePredicateFn &pred, const SmallVectorImpl &Ops) - : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()), - Operands(Ops.begin(), Ops.end()) {} + : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()), + Operands(Ops.begin(), Ops.end()) {} TreePredicateFn CheckPredicateMatcher::getPredicate() const { return TreePredicateFn(Pred); @@ -112,16 +109,15 @@ unsigned CheckPredicateMatcher::getOperandNo(unsigned i) const { return Operands[i]; } - // printImpl methods. 
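
The variant loop in DAGISelEmitter::run above is open-ended on purpose: ConvertPatternToMatcher is asked for variant 0, 1, 2, ... until it returns null, and every non-null result is kept. A sketch of that control flow, with makeVariant as a hypothetical stand-in for ConvertPatternToMatcher:

  // Enumerate variants until the factory runs dry.
  #include <memory>
  #include <vector>

  std::unique_ptr<int> makeVariant(unsigned V) {
    return V < 3 ? std::make_unique<int>(V) : nullptr; // say, 3 variants
  }

  int main() {
    std::vector<std::unique_ptr<int>> Matchers;
    for (unsigned Variant = 0;; ++Variant) {
      if (auto M = makeVariant(Variant))
        Matchers.push_back(std::move(M)); // keep every variant produced
      else
        break; // no more variants of this pattern
    }
  }
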
void ScopeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "Scope\n"; for (const Matcher *C : Children) { if (!C) - OS.indent(indent+1) << "NULL POINTER\n"; + OS.indent(indent + 1) << "NULL POINTER\n"; else - C->print(OS, indent+2); + C->print(OS, indent + 2); } } @@ -137,7 +133,8 @@ void RecordMemRefMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "RecordMemRef\n"; } -void CaptureGlueInputMatcher::printImpl(raw_ostream &OS, unsigned indent) const{ +void CaptureGlueInputMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "CaptureGlueInput\n"; } @@ -161,8 +158,8 @@ void CheckChildSameMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "CheckChild" << ChildNo << "Same\n"; } -void CheckPatternPredicateMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void CheckPatternPredicateMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "CheckPatternPredicate " << Predicate << '\n'; } @@ -178,32 +175,30 @@ void SwitchOpcodeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "SwitchOpcode: {\n"; for (const auto &C : Cases) { OS.indent(indent) << "case " << C.first->getEnumName() << ":\n"; - C.second->print(OS, indent+2); + C.second->print(OS, indent + 2); } OS.indent(indent) << "}\n"; } - void CheckTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { - OS.indent(indent) << "CheckType " << getEnumName(Type) << ", ResNo=" - << ResNo << '\n'; + OS.indent(indent) << "CheckType " << getEnumName(Type) << ", ResNo=" << ResNo + << '\n'; } void SwitchTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "SwitchType: {\n"; for (const auto &C : Cases) { OS.indent(indent) << "case " << getEnumName(C.first) << ":\n"; - C.second->print(OS, indent+2); + C.second->print(OS, indent + 2); } OS.indent(indent) << "}\n"; } void CheckChildTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { - OS.indent(indent) << "CheckChildType " << ChildNo << " " - << getEnumName(Type) << '\n'; + OS.indent(indent) << "CheckChildType " << ChildNo << " " << getEnumName(Type) + << '\n'; } - void CheckIntegerMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << "CheckInteger " << Value << '\n'; } @@ -258,8 +253,8 @@ void EmitIntegerMatcher::printImpl(raw_ostream &OS, unsigned indent) const { << '\n'; } -void EmitStringIntegerMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void EmitStringIntegerMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "EmitStringInteger " << Val << " VT=" << getEnumName(VT) << '\n'; } @@ -273,13 +268,13 @@ void EmitRegisterMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS << " VT=" << getEnumName(VT) << '\n'; } -void EmitConvertToTargetMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void EmitConvertToTargetMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "EmitConvertToTarget " << Slot << '\n'; } -void EmitMergeInputChainsMatcher:: -printImpl(raw_ostream &OS, unsigned indent) const { +void EmitMergeInputChainsMatcher::printImpl(raw_ostream &OS, + unsigned indent) const { OS.indent(indent) << "EmitMergeInputChains \n"; } @@ -289,10 +284,9 @@ void EmitCopyToRegMatcher::printImpl(raw_ostream &OS, unsigned indent) const { void EmitNodeXFormMatcher::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent) << 
"EmitNodeXForm " << NodeXForm->getName() - << " Slot=" << Slot << '\n'; + << " Slot=" << Slot << '\n'; } - void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, unsigned indent) const { OS.indent(indent); OS << (isa(this) ? "MorphNodeTo: " : "EmitNode: ") @@ -316,7 +310,7 @@ bool CheckOpcodeMatcher::isEqualImpl(const Matcher *M) const { // Note: pointer equality isn't enough here, we have to check the enum names // to ensure that the nodes are for the same opcode. return cast(M)->Opcode.getEnumName() == - Opcode.getEnumName(); + Opcode.getEnumName(); } bool EmitNodeMatcherCommon::isEqualImpl(const Matcher *m) const { @@ -327,9 +321,9 @@ bool EmitNodeMatcherCommon::isEqualImpl(const Matcher *m) const { M->NumFixedArityOperands == NumFixedArityOperands; } -void EmitNodeMatcher::anchor() { } +void EmitNodeMatcher::anchor() {} -void MorphNodeToMatcher::anchor() { } +void MorphNodeToMatcher::anchor() {} // isContradictoryImpl Implementations. @@ -337,7 +331,8 @@ static bool TypesAreContradictory(MVT::SimpleValueType T1, MVT::SimpleValueType T2) { // If the two types are the same, then they are the same, so they don't // contradict. - if (T1 == T2) return false; + if (T1 == T2) + return false; // If either type is about iPtr, then they don't conflict unless the other // one is not a scalar integer type. @@ -400,7 +395,8 @@ bool CheckIntegerMatcher::isContradictoryImpl(const Matcher *M) const { } bool CheckChildIntegerMatcher::isContradictoryImpl(const Matcher *M) const { - if (const CheckChildIntegerMatcher *CCIM = dyn_cast(M)) { + if (const CheckChildIntegerMatcher *CCIM = + dyn_cast(M)) { // If the two checks are about different nodes, we don't know if they // conflict! if (CCIM->getChildNo() != getChildNo()) diff --git a/llvm/utils/TableGen/DAGISelMatcher.h b/llvm/utils/TableGen/DAGISelMatcher.h index 6615a15..d4fe513 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.h +++ b/llvm/utils/TableGen/DAGISelMatcher.h @@ -21,185 +21,186 @@ #include namespace llvm { - class CodeGenRegister; - class CodeGenDAGPatterns; - class CodeGenInstruction; - class Matcher; - class PatternToMatch; - class raw_ostream; - class ComplexPattern; - class Record; - class SDNodeInfo; - class TreePredicateFn; - class TreePattern; - - Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern, - unsigned Variant, - const CodeGenDAGPatterns &CGP); - void OptimizeMatcher(std::unique_ptr &Matcher, - const CodeGenDAGPatterns &CGP); - void EmitMatcherTable(Matcher *Matcher, const CodeGenDAGPatterns &CGP, - raw_ostream &OS); - - /// Matcher - Base class for all the DAG ISel Matcher representation - /// nodes. - class Matcher { - // The next matcher node that is executed after this one. Null if this is - // the last stage of a match. - std::unique_ptr Next; - size_t Size = 0; // Size in bytes of matcher and all its children (if any). - virtual void anchor(); - - public: - enum KindTy { - // Matcher state manipulation. - Scope, // Push a checking scope. - RecordNode, // Record the current node. - RecordChild, // Record a child of the current node. - RecordMemRef, // Record the memref in the current node. - CaptureGlueInput, // If the current node has an input glue, save it. - MoveChild, // Move current node to specified child. - MoveSibling, // Move current node to specified sibling. - MoveParent, // Move current node to parent. - - // Predicate checking. - CheckSame, // Fail if not same as prev match. - CheckChildSame, // Fail if child not same as prev match. 
- CheckPatternPredicate, - CheckPredicate, // Fail if node predicate fails. - CheckOpcode, // Fail if not opcode. - SwitchOpcode, // Dispatch based on opcode. - CheckType, // Fail if not correct type. - SwitchType, // Dispatch based on type. - CheckChildType, // Fail if child has wrong type. - CheckInteger, // Fail if wrong val. - CheckChildInteger, // Fail if child is wrong val. - CheckCondCode, // Fail if not condcode. - CheckChild2CondCode, // Fail if child is wrong condcode. - CheckValueType, - CheckComplexPat, - CheckAndImm, - CheckOrImm, - CheckImmAllOnesV, - CheckImmAllZerosV, - CheckFoldableChainNode, - - // Node creation/emisssion. - EmitInteger, // Create a TargetConstant - EmitStringInteger, // Create a TargetConstant from a string. - EmitRegister, // Create a register. - EmitConvertToTarget, // Convert a imm/fpimm to target imm/fpimm - EmitMergeInputChains, // Merge together a chains for an input. - EmitCopyToReg, // Emit a copytoreg into a physreg. - EmitNode, // Create a DAG node - EmitNodeXForm, // Run a SDNodeXForm - CompleteMatch, // Finish a match and update the results. - MorphNodeTo, // Build a node, finish a match and update results. - - // Highest enum value; watch out when adding more. - HighestKind = MorphNodeTo - }; - const KindTy Kind; - - protected: - Matcher(KindTy K) : Kind(K) {} - - public: - virtual ~Matcher() {} - - unsigned getSize() const { return Size; } - void setSize(unsigned sz) { Size = sz; } - KindTy getKind() const { return Kind; } - - Matcher *getNext() { return Next.get(); } - const Matcher *getNext() const { return Next.get(); } - void setNext(Matcher *C) { Next.reset(C); } - Matcher *takeNext() { return Next.release(); } - - std::unique_ptr &getNextPtr() { return Next; } - - bool isEqual(const Matcher *M) const { - if (getKind() != M->getKind()) - return false; - return isEqualImpl(M); - } +class CodeGenRegister; +class CodeGenDAGPatterns; +class CodeGenInstruction; +class Matcher; +class PatternToMatch; +class raw_ostream; +class ComplexPattern; +class Record; +class SDNodeInfo; +class TreePredicateFn; +class TreePattern; + +Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern, + unsigned Variant, + const CodeGenDAGPatterns &CGP); +void OptimizeMatcher(std::unique_ptr &Matcher, + const CodeGenDAGPatterns &CGP); +void EmitMatcherTable(Matcher *Matcher, const CodeGenDAGPatterns &CGP, + raw_ostream &OS); + +/// Matcher - Base class for all the DAG ISel Matcher representation +/// nodes. +class Matcher { + // The next matcher node that is executed after this one. Null if this is + // the last stage of a match. + std::unique_ptr Next; + size_t Size = 0; // Size in bytes of matcher and all its children (if any). + virtual void anchor(); - /// isSimplePredicateNode - Return true if this is a simple predicate that - /// operates on the node or its children without potential side effects or a - /// change of the current node. - bool isSimplePredicateNode() const { - switch (getKind()) { - default: - return false; - case CheckSame: - case CheckChildSame: - case CheckPatternPredicate: - case CheckPredicate: - case CheckOpcode: - case CheckType: - case CheckChildType: - case CheckInteger: - case CheckChildInteger: - case CheckCondCode: - case CheckChild2CondCode: - case CheckValueType: - case CheckAndImm: - case CheckOrImm: - case CheckImmAllOnesV: - case CheckImmAllZerosV: - case CheckFoldableChainNode: - return true; - } - } +public: + enum KindTy { + // Matcher state manipulation. + Scope, // Push a checking scope. 
+ RecordNode, // Record the current node. + RecordChild, // Record a child of the current node. + RecordMemRef, // Record the memref in the current node. + CaptureGlueInput, // If the current node has an input glue, save it. + MoveChild, // Move current node to specified child. + MoveSibling, // Move current node to specified sibling. + MoveParent, // Move current node to parent. + + // Predicate checking. + CheckSame, // Fail if not same as prev match. + CheckChildSame, // Fail if child not same as prev match. + CheckPatternPredicate, + CheckPredicate, // Fail if node predicate fails. + CheckOpcode, // Fail if not opcode. + SwitchOpcode, // Dispatch based on opcode. + CheckType, // Fail if not correct type. + SwitchType, // Dispatch based on type. + CheckChildType, // Fail if child has wrong type. + CheckInteger, // Fail if wrong val. + CheckChildInteger, // Fail if child is wrong val. + CheckCondCode, // Fail if not condcode. + CheckChild2CondCode, // Fail if child is wrong condcode. + CheckValueType, + CheckComplexPat, + CheckAndImm, + CheckOrImm, + CheckImmAllOnesV, + CheckImmAllZerosV, + CheckFoldableChainNode, + + // Node creation/emisssion. + EmitInteger, // Create a TargetConstant + EmitStringInteger, // Create a TargetConstant from a string. + EmitRegister, // Create a register. + EmitConvertToTarget, // Convert a imm/fpimm to target imm/fpimm + EmitMergeInputChains, // Merge together a chains for an input. + EmitCopyToReg, // Emit a copytoreg into a physreg. + EmitNode, // Create a DAG node + EmitNodeXForm, // Run a SDNodeXForm + CompleteMatch, // Finish a match and update the results. + MorphNodeTo, // Build a node, finish a match and update results. + + // Highest enum value; watch out when adding more. + HighestKind = MorphNodeTo + }; + const KindTy Kind; - /// isSimplePredicateOrRecordNode - Return true if this is a record node or - /// a simple predicate. - bool isSimplePredicateOrRecordNode() const { - return isSimplePredicateNode() || getKind() == RecordNode || - getKind() == RecordChild; - } +protected: + Matcher(KindTy K) : Kind(K) {} - /// unlinkNode - Unlink the specified node from this chain. If Other == - /// this, we unlink the next pointer and return it. Otherwise we unlink - /// Other from the list and return this. - Matcher *unlinkNode(Matcher *Other); - - /// canMoveBefore - Return true if this matcher is the same as Other, or if - /// we can move this matcher past all of the nodes in-between Other and this - /// node. Other must be equal to or before this. - bool canMoveBefore(const Matcher *Other) const; - - /// canMoveBeforeNode - Return true if it is safe to move the current - /// matcher across the specified one. - bool canMoveBeforeNode(const Matcher *Other) const; - - /// isContradictory - Return true of these two matchers could never match on - /// the same node. - bool isContradictory(const Matcher *Other) const { - // Since this predicate is reflexive, we canonicalize the ordering so that - // we always match a node against nodes with kinds that are greater or - // equal to them. For example, we'll pass in a CheckType node as an - // argument to the CheckOpcode method, not the other way around. 
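
The KindTy enumeration above, together with the per-subclass classof shown throughout this header, is the usual LLVM-style RTTI arrangement: the base class stores a kind tag, and isa/dyn_cast consult classof instead of C++ RTTI. A plain-C++ sketch of the convention, without the LLVM casting templates:

  // Kind tag plus classof, as used by the Matcher hierarchy.
  struct Node {
    enum KindTy { Scope, RecordNode } Kind;
    Node(KindTy K) : Kind(K) {}
    KindTy getKind() const { return Kind; }
  };

  struct ScopeNode : Node {
    ScopeNode() : Node(Scope) {}
    // dyn_cast<ScopeNode>(N) succeeds exactly when classof(N) is true.
    static bool classof(const Node *N) { return N->getKind() == Scope; }
  };

  // Hand-rolled dyn_cast for the sketch: null unless classof agrees.
  const ScopeNode *dynCastScope(const Node *N) {
    return ScopeNode::classof(N) ? static_cast<const ScopeNode *>(N) : nullptr;
  }
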
- if (getKind() < Other->getKind()) - return isContradictoryImpl(Other); - return Other->isContradictoryImpl(this); +public: + virtual ~Matcher() {} + + unsigned getSize() const { return Size; } + void setSize(unsigned sz) { Size = sz; } + KindTy getKind() const { return Kind; } + + Matcher *getNext() { return Next.get(); } + const Matcher *getNext() const { return Next.get(); } + void setNext(Matcher *C) { Next.reset(C); } + Matcher *takeNext() { return Next.release(); } + + std::unique_ptr &getNextPtr() { return Next; } + + bool isEqual(const Matcher *M) const { + if (getKind() != M->getKind()) + return false; + return isEqualImpl(M); + } + + /// isSimplePredicateNode - Return true if this is a simple predicate that + /// operates on the node or its children without potential side effects or a + /// change of the current node. + bool isSimplePredicateNode() const { + switch (getKind()) { + default: + return false; + case CheckSame: + case CheckChildSame: + case CheckPatternPredicate: + case CheckPredicate: + case CheckOpcode: + case CheckType: + case CheckChildType: + case CheckInteger: + case CheckChildInteger: + case CheckCondCode: + case CheckChild2CondCode: + case CheckValueType: + case CheckAndImm: + case CheckOrImm: + case CheckImmAllOnesV: + case CheckImmAllZerosV: + case CheckFoldableChainNode: + return true; } + } - void print(raw_ostream &OS, unsigned indent = 0) const; - void printOne(raw_ostream &OS) const; - void dump() const; + /// isSimplePredicateOrRecordNode - Return true if this is a record node or + /// a simple predicate. + bool isSimplePredicateOrRecordNode() const { + return isSimplePredicateNode() || getKind() == RecordNode || + getKind() == RecordChild; + } - protected: - virtual void printImpl(raw_ostream &OS, unsigned indent) const = 0; - virtual bool isEqualImpl(const Matcher *M) const = 0; - virtual bool isContradictoryImpl(const Matcher *M) const { return false; } - }; + /// unlinkNode - Unlink the specified node from this chain. If Other == + /// this, we unlink the next pointer and return it. Otherwise we unlink + /// Other from the list and return this. + Matcher *unlinkNode(Matcher *Other); + + /// canMoveBefore - Return true if this matcher is the same as Other, or if + /// we can move this matcher past all of the nodes in-between Other and this + /// node. Other must be equal to or before this. + bool canMoveBefore(const Matcher *Other) const; + + /// canMoveBeforeNode - Return true if it is safe to move the current + /// matcher across the specified one. + bool canMoveBeforeNode(const Matcher *Other) const; + + /// isContradictory - Return true of these two matchers could never match on + /// the same node. + bool isContradictory(const Matcher *Other) const { + // Since this predicate is reflexive, we canonicalize the ordering so that + // we always match a node against nodes with kinds that are greater or + // equal to them. For example, we'll pass in a CheckType node as an + // argument to the CheckOpcode method, not the other way around. 
+ if (getKind() < Other->getKind()) + return isContradictoryImpl(Other); + return Other->isContradictoryImpl(this); + } + + void print(raw_ostream &OS, unsigned indent = 0) const; + void printOne(raw_ostream &OS) const; + void dump() const; + +protected: + virtual void printImpl(raw_ostream &OS, unsigned indent) const = 0; + virtual bool isEqualImpl(const Matcher *M) const = 0; + virtual bool isContradictoryImpl(const Matcher *M) const { return false; } +}; /// ScopeMatcher - This attempts to match each of its children to find the first /// one that successfully matches. If one child fails, it tries the next child. /// If none of the children match then this check fails. It never has a 'next'. class ScopeMatcher : public Matcher { - SmallVector Children; + SmallVector Children; + public: ScopeMatcher(SmallVectorImpl &&children) : Matcher(Scope), Children(std::move(children)) {} @@ -230,9 +231,7 @@ public: Children.resize(NC); } - static bool classof(const Matcher *N) { - return N->getKind() == Scope; - } + static bool classof(const Matcher *N) { return N->getKind() == Scope; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -248,16 +247,15 @@ class RecordMatcher : public Matcher { /// ResultNo - The slot number in the RecordedNodes vector that this will be, /// just printed as a comment. unsigned ResultNo; + public: RecordMatcher(const std::string &whatfor, unsigned resultNo) - : Matcher(RecordNode), WhatFor(whatfor), ResultNo(resultNo) {} + : Matcher(RecordNode), WhatFor(whatfor), ResultNo(resultNo) {} const std::string &getWhatFor() const { return WhatFor; } unsigned getResultNo() const { return ResultNo; } - static bool classof(const Matcher *N) { - return N->getKind() == RecordNode; - } + static bool classof(const Matcher *N) { return N->getKind() == RecordNode; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -277,19 +275,18 @@ class RecordChildMatcher : public Matcher { /// ResultNo - The slot number in the RecordedNodes vector that this will be, /// just printed as a comment. unsigned ResultNo; + public: RecordChildMatcher(unsigned childno, const std::string &whatfor, unsigned resultNo) - : Matcher(RecordChild), ChildNo(childno), WhatFor(whatfor), - ResultNo(resultNo) {} + : Matcher(RecordChild), ChildNo(childno), WhatFor(whatfor), + ResultNo(resultNo) {} unsigned getChildNo() const { return ChildNo; } const std::string &getWhatFor() const { return WhatFor; } unsigned getResultNo() const { return ResultNo; } - static bool classof(const Matcher *N) { - return N->getKind() == RecordChild; - } + static bool classof(const Matcher *N) { return N->getKind() == RecordChild; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -303,16 +300,13 @@ class RecordMemRefMatcher : public Matcher { public: RecordMemRefMatcher() : Matcher(RecordMemRef) {} - static bool classof(const Matcher *N) { - return N->getKind() == RecordMemRef; - } + static bool classof(const Matcher *N) { return N->getKind() == RecordMemRef; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; bool isEqualImpl(const Matcher *M) const override { return true; } }; - /// CaptureGlueInputMatcher - If the current record has a glue input, record /// it so that it is used as an input to the generated code. class CaptureGlueInputMatcher : public Matcher { @@ -332,14 +326,13 @@ private: /// specified child node. 
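
isContradictory above implements a symmetric relation with a single virtual hook by canonicalizing the argument order: whichever matcher has the smaller kind drives the isContradictoryImpl call, so each kind pair only has to be handled once. A stripped-down sketch:

  // Canonicalized double dispatch, as in Matcher::isContradictory.
  struct M {
    int Kind;
    explicit M(int K) : Kind(K) {}
    bool isContradictory(const M *Other) const {
      if (Kind < Other->Kind)
        return isContradictoryImpl(Other);    // smaller kind drives the call
      return Other->isContradictoryImpl(this); // flip to keep kinds ordered
    }
    virtual bool isContradictoryImpl(const M *) const { return false; }
    virtual ~M() = default;
  };
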
class MoveChildMatcher : public Matcher { unsigned ChildNo; + public: MoveChildMatcher(unsigned childNo) : Matcher(MoveChild), ChildNo(childNo) {} unsigned getChildNo() const { return ChildNo; } - static bool classof(const Matcher *N) { - return N->getKind() == MoveChild; - } + static bool classof(const Matcher *N) { return N->getKind() == MoveChild; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -374,9 +367,7 @@ class MoveParentMatcher : public Matcher { public: MoveParentMatcher() : Matcher(MoveParent) {} - static bool classof(const Matcher *N) { - return N->getKind() == MoveParent; - } + static bool classof(const Matcher *N) { return N->getKind() == MoveParent; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -388,15 +379,14 @@ private: /// when patterns have the same name in them, like '(mul GPR:$in, GPR:$in)'. class CheckSameMatcher : public Matcher { unsigned MatchNumber; + public: CheckSameMatcher(unsigned matchnumber) - : Matcher(CheckSame), MatchNumber(matchnumber) {} + : Matcher(CheckSame), MatchNumber(matchnumber) {} unsigned getMatchNumber() const { return MatchNumber; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckSame; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckSame; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -411,9 +401,10 @@ private: class CheckChildSameMatcher : public Matcher { unsigned ChildNo; unsigned MatchNumber; + public: CheckChildSameMatcher(unsigned childno, unsigned matchnumber) - : Matcher(CheckChildSame), ChildNo(childno), MatchNumber(matchnumber) {} + : Matcher(CheckChildSame), ChildNo(childno), MatchNumber(matchnumber) {} unsigned getChildNo() const { return ChildNo; } unsigned getMatchNumber() const { return MatchNumber; } @@ -435,9 +426,10 @@ private: /// not take a node as input. This is used for subtarget feature checks etc. class CheckPatternPredicateMatcher : public Matcher { std::string Predicate; + public: CheckPatternPredicateMatcher(StringRef predicate) - : Matcher(CheckPatternPredicate), Predicate(predicate) {} + : Matcher(CheckPatternPredicate), Predicate(predicate) {} StringRef getPredicate() const { return Predicate; } @@ -457,6 +449,7 @@ private: class CheckPredicateMatcher : public Matcher { TreePattern *Pred; const SmallVector Operands; + public: CheckPredicateMatcher(const TreePredicateFn &pred, const SmallVectorImpl &Operands); @@ -476,20 +469,18 @@ private: } }; - /// CheckOpcodeMatcher - This checks to see if the current node has the /// specified opcode, if not it fails to match. class CheckOpcodeMatcher : public Matcher { const SDNodeInfo &Opcode; + public: CheckOpcodeMatcher(const SDNodeInfo &opcode) - : Matcher(CheckOpcode), Opcode(opcode) {} + : Matcher(CheckOpcode), Opcode(opcode) {} const SDNodeInfo &getOpcode() const { return Opcode; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckOpcode; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckOpcode; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -502,16 +493,15 @@ private: /// then the match fails. This is semantically equivalent to a Scope node where /// every child does a CheckOpcode, but is much faster. 
class SwitchOpcodeMatcher : public Matcher { - SmallVector, 8> Cases; + SmallVector, 8> Cases; + public: SwitchOpcodeMatcher( SmallVectorImpl> &&cases) : Matcher(SwitchOpcode), Cases(std::move(cases)) {} ~SwitchOpcodeMatcher() override; - static bool classof(const Matcher *N) { - return N->getKind() == SwitchOpcode; - } + static bool classof(const Matcher *N) { return N->getKind() == SwitchOpcode; } unsigned getNumCases() const { return Cases.size(); } @@ -529,16 +519,15 @@ private: class CheckTypeMatcher : public Matcher { MVT::SimpleValueType Type; unsigned ResNo; + public: CheckTypeMatcher(MVT::SimpleValueType type, unsigned resno) - : Matcher(CheckType), Type(type), ResNo(resno) {} + : Matcher(CheckType), Type(type), ResNo(resno) {} MVT::SimpleValueType getType() const { return Type; } unsigned getResNo() const { return ResNo; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckType; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckType; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -553,16 +542,15 @@ private: /// then the match fails. This is semantically equivalent to a Scope node where /// every child does a CheckType, but is much faster. class SwitchTypeMatcher : public Matcher { - SmallVector, 8> Cases; + SmallVector, 8> Cases; + public: SwitchTypeMatcher( SmallVectorImpl> &&cases) : Matcher(SwitchType), Cases(std::move(cases)) {} ~SwitchTypeMatcher() override; - static bool classof(const Matcher *N) { - return N->getKind() == SwitchType; - } + static bool classof(const Matcher *N) { return N->getKind() == SwitchType; } unsigned getNumCases() const { return Cases.size(); } @@ -575,15 +563,15 @@ private: bool isEqualImpl(const Matcher *M) const override { return false; } }; - /// CheckChildTypeMatcher - This checks to see if a child node has the /// specified type, if not it fails to match. class CheckChildTypeMatcher : public Matcher { unsigned ChildNo; MVT::SimpleValueType Type; + public: CheckChildTypeMatcher(unsigned childno, MVT::SimpleValueType type) - : Matcher(CheckChildType), ChildNo(childno), Type(type) {} + : Matcher(CheckChildType), ChildNo(childno), Type(type) {} unsigned getChildNo() const { return ChildNo; } MVT::SimpleValueType getType() const { return Type; } @@ -601,20 +589,17 @@ private: bool isContradictoryImpl(const Matcher *M) const override; }; - /// CheckIntegerMatcher - This checks to see if the current node is a /// ConstantSDNode with the specified integer value, if not it fails to match. 
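
As the comments above note, SwitchOpcodeMatcher and SwitchTypeMatcher are semantically a Scope whose children each begin with CheckOpcode or CheckType, but they dispatch on a key instead of trying children in turn. A sketch of the difference, with integer keys standing in for opcodes:

  // Scope-style vs switch-style dispatch.
  #include <map>

  // Scope-style: try each case in order; O(n) failed checks in the worst case.
  bool runScope(int Key, const std::map<int, bool> &Cases) {
    for (const auto &KV : Cases)
      if (KV.first == Key) // each child re-checks the opcode
        return KV.second;
    return false;
  }

  // Switch-style: a single keyed lookup picks the case directly.
  bool runSwitch(int Key, const std::map<int, bool> &Cases) {
    auto It = Cases.find(Key);
    return It != Cases.end() && It->second;
  }
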
class CheckIntegerMatcher : public Matcher { int64_t Value; + public: - CheckIntegerMatcher(int64_t value) - : Matcher(CheckInteger), Value(value) {} + CheckIntegerMatcher(int64_t value) : Matcher(CheckInteger), Value(value) {} int64_t getValue() const { return Value; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckInteger; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckInteger; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -629,9 +614,10 @@ private: class CheckChildIntegerMatcher : public Matcher { unsigned ChildNo; int64_t Value; + public: CheckChildIntegerMatcher(unsigned childno, int64_t value) - : Matcher(CheckChildInteger), ChildNo(childno), Value(value) {} + : Matcher(CheckChildInteger), ChildNo(childno), Value(value) {} unsigned getChildNo() const { return ChildNo; } int64_t getValue() const { return Value; } @@ -653,9 +639,10 @@ private: /// CondCodeSDNode with the specified condition, if not it fails to match. class CheckCondCodeMatcher : public Matcher { StringRef CondCodeName; + public: CheckCondCodeMatcher(StringRef condcodename) - : Matcher(CheckCondCode), CondCodeName(condcodename) {} + : Matcher(CheckCondCode), CondCodeName(condcodename) {} StringRef getCondCodeName() const { return CondCodeName; } @@ -675,9 +662,10 @@ private: /// CondCodeSDNode with the specified condition, if not it fails to match. class CheckChild2CondCodeMatcher : public Matcher { StringRef CondCodeName; + public: CheckChild2CondCodeMatcher(StringRef condcodename) - : Matcher(CheckChild2CondCode), CondCodeName(condcodename) {} + : Matcher(CheckChild2CondCode), CondCodeName(condcodename) {} StringRef getCondCodeName() const { return CondCodeName; } @@ -697,9 +685,10 @@ private: /// VTSDNode with the specified type, if not it fails to match. class CheckValueTypeMatcher : public Matcher { StringRef TypeName; + public: CheckValueTypeMatcher(StringRef type_name) - : Matcher(CheckValueType), TypeName(type_name) {} + : Matcher(CheckValueType), TypeName(type_name) {} StringRef getTypeName() const { return TypeName; } @@ -715,8 +704,6 @@ private: bool isContradictoryImpl(const Matcher *M) const override; }; - - /// CheckComplexPatMatcher - This node runs the specified ComplexPattern on /// the current node. class CheckComplexPatMatcher : public Matcher { @@ -732,11 +719,12 @@ class CheckComplexPatMatcher : public Matcher { /// FirstResult - This is the first slot in the RecordedNodes list that the /// result of the match populates. unsigned FirstResult; + public: CheckComplexPatMatcher(const ComplexPattern &pattern, unsigned matchnumber, const std::string &name, unsigned firstresult) - : Matcher(CheckComplexPat), Pattern(pattern), MatchNumber(matchnumber), - Name(name), FirstResult(firstresult) {} + : Matcher(CheckComplexPat), Pattern(pattern), MatchNumber(matchnumber), + Name(name), FirstResult(firstresult) {} const ComplexPattern &getPattern() const { return Pattern; } unsigned getMatchNumber() const { return MatchNumber; } @@ -760,15 +748,13 @@ private: /// with something equivalent to the specified immediate. 
class CheckAndImmMatcher : public Matcher { int64_t Value; + public: - CheckAndImmMatcher(int64_t value) - : Matcher(CheckAndImm), Value(value) {} + CheckAndImmMatcher(int64_t value) : Matcher(CheckAndImm), Value(value) {} int64_t getValue() const { return Value; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckAndImm; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckAndImm; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -781,15 +767,13 @@ private: /// with something equivalent to the specified immediate. class CheckOrImmMatcher : public Matcher { int64_t Value; + public: - CheckOrImmMatcher(int64_t value) - : Matcher(CheckOrImm), Value(value) {} + CheckOrImmMatcher(int64_t value) : Matcher(CheckOrImm), Value(value) {} int64_t getValue() const { return Value; } - static bool classof(const Matcher *N) { - return N->getKind() == CheckOrImm; - } + static bool classof(const Matcher *N) { return N->getKind() == CheckOrImm; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -834,8 +818,7 @@ private: /// (which defines a chain operand) is safe to fold into a larger pattern. class CheckFoldableChainNodeMatcher : public Matcher { public: - CheckFoldableChainNodeMatcher() - : Matcher(CheckFoldableChainNode) {} + CheckFoldableChainNodeMatcher() : Matcher(CheckFoldableChainNode) {} static bool classof(const Matcher *N) { return N->getKind() == CheckFoldableChainNode; @@ -850,16 +833,15 @@ private: class EmitIntegerMatcher : public Matcher { int64_t Val; MVT::SimpleValueType VT; + public: EmitIntegerMatcher(int64_t val, MVT::SimpleValueType vt) - : Matcher(EmitInteger), Val(val), VT(vt) {} + : Matcher(EmitInteger), Val(val), VT(vt) {} int64_t getValue() const { return Val; } MVT::SimpleValueType getVT() const { return VT; } - static bool classof(const Matcher *N) { - return N->getKind() == EmitInteger; - } + static bool classof(const Matcher *N) { return N->getKind() == EmitInteger; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -874,9 +856,10 @@ private: class EmitStringIntegerMatcher : public Matcher { std::string Val; MVT::SimpleValueType VT; + public: EmitStringIntegerMatcher(const std::string &val, MVT::SimpleValueType vt) - : Matcher(EmitStringInteger), Val(val), VT(vt) {} + : Matcher(EmitStringInteger), Val(val), VT(vt) {} const std::string &getValue() const { return Val; } MVT::SimpleValueType getVT() const { return VT; } @@ -899,16 +882,15 @@ class EmitRegisterMatcher : public Matcher { /// this is a reference to zero_reg. const CodeGenRegister *Reg; MVT::SimpleValueType VT; + public: EmitRegisterMatcher(const CodeGenRegister *reg, MVT::SimpleValueType vt) - : Matcher(EmitRegister), Reg(reg), VT(vt) {} + : Matcher(EmitRegister), Reg(reg), VT(vt) {} const CodeGenRegister *getReg() const { return Reg; } MVT::SimpleValueType getVT() const { return VT; } - static bool classof(const Matcher *N) { - return N->getKind() == EmitRegister; - } + static bool classof(const Matcher *N) { return N->getKind() == EmitRegister; } private: void printImpl(raw_ostream &OS, unsigned indent) const override; @@ -923,9 +905,10 @@ private: /// ISD::TargetConstant, likewise for ConstantFP. 
class EmitConvertToTargetMatcher : public Matcher { unsigned Slot; + public: EmitConvertToTargetMatcher(unsigned slot) - : Matcher(EmitConvertToTarget), Slot(slot) {} + : Matcher(EmitConvertToTarget), Slot(slot) {} unsigned getSlot() const { return Slot; } @@ -946,9 +929,10 @@ private: /// chains of these nodes if they are not themselves a node in the pattern. class EmitMergeInputChainsMatcher : public Matcher { SmallVector ChainNodes; + public: EmitMergeInputChainsMatcher(ArrayRef nodes) - : Matcher(EmitMergeInputChains), ChainNodes(nodes.begin(), nodes.end()) {} + : Matcher(EmitMergeInputChains), ChainNodes(nodes.begin(), nodes.end()) {} unsigned getNumNodes() const { return ChainNodes.size(); } @@ -976,9 +960,8 @@ class EmitCopyToRegMatcher : public Matcher { const CodeGenRegister *DestPhysReg; public: - EmitCopyToRegMatcher(unsigned srcSlot, - const CodeGenRegister *destPhysReg) - : Matcher(EmitCopyToReg), SrcSlot(srcSlot), DestPhysReg(destPhysReg) {} + EmitCopyToRegMatcher(unsigned srcSlot, const CodeGenRegister *destPhysReg) + : Matcher(EmitCopyToReg), SrcSlot(srcSlot), DestPhysReg(destPhysReg) {} unsigned getSrcSlot() const { return SrcSlot; } const CodeGenRegister *getDestPhysReg() const { return DestPhysReg; } @@ -995,16 +978,15 @@ private: } }; - - /// EmitNodeXFormMatcher - Emit an operation that runs an SDNodeXForm on a /// recorded node and records the result. class EmitNodeXFormMatcher : public Matcher { unsigned Slot; Record *NodeXForm; + public: EmitNodeXFormMatcher(unsigned slot, Record *nodeXForm) - : Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {} + : Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {} unsigned getSlot() const { return Slot; } Record *getNodeXForm() const { return NodeXForm; } @@ -1033,6 +1015,7 @@ class EmitNodeMatcherCommon : public Matcher { /// If this is a varidic node, this is set to the number of fixed arity /// operands in the root of the pattern. The rest are appended to this node. 
int NumFixedArityOperands; + public: EmitNodeMatcherCommon(const CodeGenInstruction &cgi, ArrayRef vts, @@ -1061,7 +1044,6 @@ public: const SmallVectorImpl &getVTList() const { return VTs; } const SmallVectorImpl &getOperandList() const { return Operands; } - bool hasChain() const { return HasChain; } bool hasInGlue() const { return HasInGlue; } bool hasOutGlue() const { return HasOutGlue; } @@ -1081,6 +1063,7 @@ private: class EmitNodeMatcher : public EmitNodeMatcherCommon { void anchor() override; unsigned FirstResultSlot; + public: EmitNodeMatcher(const CodeGenInstruction &cgi, ArrayRef vts, @@ -1094,15 +1077,13 @@ public: unsigned getFirstResultSlot() const { return FirstResultSlot; } - static bool classof(const Matcher *N) { - return N->getKind() == EmitNode; - } - + static bool classof(const Matcher *N) { return N->getKind() == EmitNode; } }; class MorphNodeToMatcher : public EmitNodeMatcherCommon { void anchor() override; const PatternToMatch &Pattern; + public: MorphNodeToMatcher(const CodeGenInstruction &cgi, ArrayRef vts, @@ -1116,9 +1097,7 @@ public: const PatternToMatch &getPattern() const { return Pattern; } - static bool classof(const Matcher *N) { - return N->getKind() == MorphNodeTo; - } + static bool classof(const Matcher *N) { return N->getKind() == MorphNodeTo; } }; /// CompleteMatchMatcher - Complete a match by replacing the results of the @@ -1127,11 +1106,12 @@ public: class CompleteMatchMatcher : public Matcher { SmallVector Results; const PatternToMatch &Pattern; + public: CompleteMatchMatcher(ArrayRef results, const PatternToMatch &pattern) - : Matcher(CompleteMatch), Results(results.begin(), results.end()), - Pattern(pattern) {} + : Matcher(CompleteMatch), Results(results.begin(), results.end()), + Pattern(pattern) {} unsigned getNumResults() const { return Results.size(); } unsigned getResult(unsigned R) const { return Results[R]; } @@ -1145,7 +1125,7 @@ private: void printImpl(raw_ostream &OS, unsigned indent) const override; bool isEqualImpl(const Matcher *M) const override { return cast(M)->Results == Results && - &cast(M)->Pattern == &Pattern; + &cast(M)->Pattern == &Pattern; } }; diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 50156d3..8d002e5 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -50,7 +50,7 @@ namespace { class MatcherTableEmitter { const CodeGenDAGPatterns &CGP; - SmallVector OpcodeCounts; + SmallVector OpcodeCounts; std::vector NodePredicates; std::vector NodePredicatesWithOperands; @@ -62,14 +62,13 @@ class MatcherTableEmitter { std::vector PatternPredicates; - std::vector ComplexPatterns; + std::vector ComplexPatterns; - - DenseMap NodeXFormMap; - std::vector NodeXForms; + DenseMap NodeXFormMap; + std::vector NodeXForms; std::vector VecIncludeStrings; - MapVector > VecPatterns; + MapVector> VecPatterns; unsigned getPatternIdxFromTable(std::string &&P, std::string &&include_loc) { const auto It = VecPatterns.find(P); @@ -184,8 +183,8 @@ private: unsigned SizeMatcher(Matcher *N, raw_ostream &OS); - unsigned EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, - raw_ostream &OS); + unsigned EmitMatcher(const Matcher *N, const unsigned Indent, + unsigned CurrentIdx, raw_ostream &OS); unsigned getNodePredicate(TreePredicateFn Pred) { // We use the first predicate. 
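
getPatternIdxFromTable above, and getNodeXFormID just below it, follow the same interning pattern: look the key up, append it on first use, and return a stable dense index into the table. A sketch with std::map and std::string standing in for the MapVector/DenseMap keys:

  // Interning: stable dense indices, assigned on first occurrence.
  #include <map>
  #include <string>
  #include <vector>

  unsigned internIndex(const std::string &Key,
                       std::map<std::string, unsigned> &Map,
                       std::vector<std::string> &Table) {
    auto It = Map.find(Key);
    if (It != Map.end())
      return It->second;         // already assigned
    Table.push_back(Key);        // first occurrence: append...
    Map[Key] = Table.size() - 1; // ...and remember its index
    return Table.size() - 1;
  }
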
@@ -210,9 +209,8 @@ private: NodeXForms.push_back(Rec); Entry = NodeXForms.size(); } - return Entry-1; + return Entry - 1; } - }; } // end anonymous namespace. @@ -224,14 +222,15 @@ static std::string GetPatFromTreePatternNode(const TreePatternNode *N) { } static unsigned GetVBRSize(unsigned Val) { - if (Val <= 127) return 1; + if (Val <= 127) + return 1; unsigned NumBytes = 0; while (Val >= 128) { Val >>= 7; ++NumBytes; } - return NumBytes+1; + return NumBytes + 1; } /// EmitVBRValue - Emit the specified value as a VBR, returning the number of @@ -245,7 +244,7 @@ static unsigned EmitVBRValue(uint64_t Val, raw_ostream &OS) { uint64_t InVal = Val; unsigned NumBytes = 0; while (Val >= 128) { - OS << (Val&127) << "|128,"; + OS << (Val & 127) << "|128,"; Val >>= 7; ++NumBytes; } @@ -253,7 +252,7 @@ static unsigned EmitVBRValue(uint64_t Val, raw_ostream &OS) { if (!OmitComments) OS << "/*" << InVal << "*/"; OS << ", "; - return NumBytes+1; + return NumBytes + 1; } /// Emit the specified signed value as a VBR. To improve compression we encode @@ -290,8 +289,7 @@ static std::string getIncludePath(const Record *R) { /// This function traverses the matcher tree and sizes all the nodes /// that are children of the three kinds of nodes that have them. -unsigned MatcherTableEmitter:: -SizeMatcherList(Matcher *N, raw_ostream &OS) { +unsigned MatcherTableEmitter::SizeMatcherList(Matcher *N, raw_ostream &OS) { unsigned Size = 0; while (N) { Size += SizeMatcher(N, OS); @@ -303,8 +301,7 @@ SizeMatcherList(Matcher *N, raw_ostream &OS) { /// This function sizes the children of the three kinds of nodes that /// have them. It does so by using special cases for those three /// nodes, but sharing the code in EmitMatcher() for the other kinds. -unsigned MatcherTableEmitter:: -SizeMatcher(Matcher *N, raw_ostream &OS) { +unsigned MatcherTableEmitter::SizeMatcher(Matcher *N, raw_ostream &OS) { unsigned Idx = 0; ++OpcodeCounts[N->getKind()]; @@ -389,7 +386,7 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) { "The sizes of Pattern and include vectors should be the same"); BeginEmitFunction(OS, "StringRef", "getPatternForIndex(unsigned Index)", - true/*AddOverride*/); + true /*AddOverride*/); OS << "{\n"; OS << "static const char *PATTERN_MATCH_TABLE[] = {\n"; @@ -403,7 +400,7 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) { EndEmitFunction(OS); BeginEmitFunction(OS, "StringRef", "getIncludePathForIndex(unsigned Index)", - true/*AddOverride*/); + true /*AddOverride*/); OS << "{\n"; OS << "static const char *INCLUDE_PATH_TABLE[] = {\n"; @@ -419,9 +416,10 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) { /// EmitMatcher - Emit bytes for the specified matcher and return /// the number of bytes emitted. 
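
GetVBRSize and EmitVBRValue above implement a variable-byte (VBR) integer encoding: seven payload bits per byte, with the high bit marking continuation, so values up to 127 cost a single byte. A sketch of both directions; the decoder below is illustrative, the real decoding happens in the generated matcher's interpreter:

  // VBR encode/decode: 7 payload bits per byte, high bit = "more follows".
  #include <cassert>
  #include <cstdint>
  #include <vector>

  std::vector<uint8_t> encodeVBR(uint64_t Val) {
    std::vector<uint8_t> Bytes;
    while (Val >= 128) {
      Bytes.push_back((Val & 127) | 128); // 7 bits, continuation bit set
      Val >>= 7;
    }
    Bytes.push_back(Val); // final byte, high bit clear
    return Bytes;
  }

  uint64_t decodeVBR(const std::vector<uint8_t> &Bytes) {
    uint64_t Val = 0;
    unsigned Shift = 0;
    for (uint8_t B : Bytes) {
      Val |= uint64_t(B & 127) << Shift;
      Shift += 7;
      if (!(B & 128)) // last byte reached
        break;
    }
    return Val;
  }

  int main() {
    assert(decodeVBR(encodeVBR(300)) == 300); // 300 -> {0xAC, 0x02}
  }
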
-unsigned MatcherTableEmitter:: -EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, - raw_ostream &OS) { +unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N, + const unsigned Indent, + unsigned CurrentIdx, + raw_ostream &OS) { OS.indent(Indent); switch (N->getKind()) { @@ -434,7 +432,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, if (i == 0) { OS << "OPC_Scope, "; ++CurrentIdx; - } else { + } else { if (!OmitComments) { OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/"; OS.indent(Indent) << "/*Scope*/ "; @@ -451,7 +449,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, } OS << '\n'; - ChildSize = EmitMatcherList(SM->getChild(i), Indent+1, + ChildSize = EmitMatcherList(SM->getChild(i), Indent + 1, CurrentIdx + VBRSize, OS); assert(ChildSize == SM->getChild(i)->getSize() && "Emitted child size does not match calculated size"); @@ -471,18 +469,15 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, case Matcher::RecordNode: OS << "OPC_RecordNode,"; if (!OmitComments) - OS << " // #" - << cast(N)->getResultNo() << " = " + OS << " // #" << cast(N)->getResultNo() << " = " << cast(N)->getWhatFor(); OS << '\n'; return 1; case Matcher::RecordChild: - OS << "OPC_RecordChild" << cast(N)->getChildNo() - << ','; + OS << "OPC_RecordChild" << cast(N)->getChildNo() << ','; if (!OmitComments) - OS << " // #" - << cast(N)->getResultNo() << " = " + OS << " // #" << cast(N)->getResultNo() << " = " << cast(N)->getWhatFor(); OS << '\n'; return 1; @@ -522,14 +517,13 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, return 1; case Matcher::CheckSame: - OS << "OPC_CheckSame, " - << cast(N)->getMatchNumber() << ",\n"; + OS << "OPC_CheckSame, " << cast(N)->getMatchNumber() + << ",\n"; return 2; case Matcher::CheckChildSame: - OS << "OPC_CheckChild" - << cast(N)->getChildNo() << "Same, " - << cast(N)->getMatchNumber() << ",\n"; + OS << "OPC_CheckChild" << cast(N)->getChildNo() + << "Same, " << cast(N)->getMatchNumber() << ",\n"; return 2; case Matcher::CheckPatternPredicate: { @@ -602,10 +596,10 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, unsigned IdxSize; if (const SwitchOpcodeMatcher *SOM = dyn_cast(N)) { Child = SOM->getCaseMatcher(i); - IdxSize = 2; // size of opcode in table is 2 bytes. + IdxSize = 2; // size of opcode in table is 2 bytes. } else { Child = cast(N)->getCaseMatcher(i); - IdxSize = 1; // size of type in table is 1 byte. + IdxSize = 1; // size of type in table is 1 byte. } if (i != 0) { @@ -613,8 +607,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/"; OS.indent(Indent); if (!OmitComments) - OS << (isa(N) ? - "/*SwitchOpcode*/ " : "/*SwitchType*/ "); + OS << (isa(N) ? 
"/*SwitchOpcode*/ " + : "/*SwitchType*/ "); } unsigned ChildSize = Child->getSize(); @@ -627,7 +621,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "// ->" << CurrentIdx + ChildSize; OS << '\n'; - ChildSize = EmitMatcherList(Child, Indent+1, CurrentIdx, OS); + ChildSize = EmitMatcherList(Child, Indent + 1, CurrentIdx, OS); assert(ChildSize == Child->getSize() && "Emitted child size does not match calculated size"); CurrentIdx += ChildSize; @@ -638,8 +632,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/"; OS.indent(Indent) << "0,"; if (!OmitComments) - OS << (isa(N) ? - " // EndSwitchOpcode" : " // EndSwitchType"); + OS << (isa(N) ? " // EndSwitchOpcode" + : " // EndSwitchType"); OS << '\n'; return CurrentIdx - StartIdx + 1; @@ -722,7 +716,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << " // " << Pattern.getSelectFunc(); OS << ":$" << CCPM->getName(); for (unsigned i = 0, e = Pattern.getNumOperands(); i != e; ++i) - OS << " #" << CCPM->getFirstResult()+i; + OS << " #" << CCPM->getFirstResult() + i; if (Pattern.hasProperty(SDNPHasChain)) OS << " + chain result"; @@ -733,14 +727,16 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, case Matcher::CheckAndImm: { OS << "OPC_CheckAndImm, "; - unsigned Bytes=1+EmitVBRValue(cast(N)->getValue(), OS); + unsigned Bytes = + 1 + EmitVBRValue(cast(N)->getValue(), OS); OS << '\n'; return Bytes; } case Matcher::CheckOrImm: { OS << "OPC_CheckOrImm, "; - unsigned Bytes = 1+EmitVBRValue(cast(N)->getValue(), OS); + unsigned Bytes = + 1 + EmitVBRValue(cast(N)->getValue(), OS); OS << '\n'; return Bytes; } @@ -843,7 +839,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, case Matcher::EmitMergeInputChains: { const EmitMergeInputChainsMatcher *MN = - cast(N); + cast(N); // Handle the specialized forms OPC_EmitMergeInputChains1_0, 1_1, and 1_2. 
if (MN->getNumNodes() == 1 && MN->getNode(0) < 3) { @@ -855,7 +851,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, for (unsigned i = 0, e = MN->getNumNodes(); i != e; ++i) OS << MN->getNode(i) << ", "; OS << '\n'; - return 2+MN->getNumNodes(); + return 2 + MN->getNumNodes(); } case Matcher::EmitCopyToReg: { const auto *C2RMatcher = cast(N); @@ -884,8 +880,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << "OPC_EmitNodeXForm, " << getNodeXFormID(XF->getNodeXForm()) << ", " << XF->getSlot() << ','; if (!OmitComments) - OS << " // "<getNodeXForm()->getName(); - OS <<'\n'; + OS << " // " << XF->getNodeXForm()->getName(); + OS << '\n'; return 3; } @@ -955,7 +951,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, } OS << ",\n"; - OS.indent(FullIndexWidth + Indent+4); + OS.indent(FullIndexWidth + Indent + 4); if (!CompressVTs) { OS << EN->getNumVTs(); if (!OmitComments) @@ -980,17 +976,18 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, OS << " // Results ="; unsigned First = E->getFirstResultSlot(); for (unsigned i = 0; i != NumResults; ++i) - OS << " #" << First+i; + OS << " #" << First + i; } } OS << '\n'; if (const MorphNodeToMatcher *SNT = dyn_cast(N)) { - OS.indent(FullIndexWidth + Indent) << "// Src: " - << *SNT->getPattern().getSrcPattern() << " - Complexity = " - << SNT->getPattern().getPatternComplexity(CGP) << '\n'; - OS.indent(FullIndexWidth + Indent) << "// Dst: " - << *SNT->getPattern().getDstPattern() << '\n'; + OS.indent(FullIndexWidth + Indent) + << "// Src: " << *SNT->getPattern().getSrcPattern() + << " - Complexity = " << SNT->getPattern().getPatternComplexity(CGP) + << '\n'; + OS.indent(FullIndexWidth + Indent) + << "// Dst: " << *SNT->getPattern().getDstPattern() << '\n'; } } else OS << '\n'; @@ -1021,11 +1018,12 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, NumResultBytes += EmitVBRValue(CM->getResult(i), OS); OS << '\n'; if (!OmitComments) { - OS.indent(FullIndexWidth + Indent) << " // Src: " - << *CM->getPattern().getSrcPattern() << " - Complexity = " - << CM->getPattern().getPatternComplexity(CGP) << '\n'; - OS.indent(FullIndexWidth + Indent) << " // Dst: " - << *CM->getPattern().getDstPattern(); + OS.indent(FullIndexWidth + Indent) + << " // Src: " << *CM->getPattern().getSrcPattern() + << " - Complexity = " << CM->getPattern().getPatternComplexity(CGP) + << '\n'; + OS.indent(FullIndexWidth + Indent) + << " // Dst: " << *CM->getPattern().getDstPattern(); } OS << '\n'; return 2 + NumResultBytes + NumCoveredBytes; @@ -1036,9 +1034,10 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, /// This function traverses the matcher tree and emits all the nodes. /// The nodes have already been sized. 
-unsigned MatcherTableEmitter:: -EmitMatcherList(const Matcher *N, const unsigned Indent, unsigned CurrentIdx, - raw_ostream &OS) { +unsigned MatcherTableEmitter::EmitMatcherList(const Matcher *N, + const unsigned Indent, + unsigned CurrentIdx, + raw_ostream &OS) { unsigned Size = 0; while (N) { if (!OmitComments) @@ -1059,7 +1058,7 @@ void MatcherTableEmitter::EmitNodePredicatesFunction( if (Preds.empty()) return; - BeginEmitFunction(OS, "bool", Decl, true/*AddOverride*/); + BeginEmitFunction(OS, "bool", Decl, true /*AddOverride*/); OS << "{\n"; OS << " switch (PredNo) {\n"; OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n"; @@ -1083,12 +1082,13 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { // Emit pattern predicates. if (!PatternPredicates.empty()) { BeginEmitFunction(OS, "bool", - "CheckPatternPredicate(unsigned PredNo) const", true/*AddOverride*/); + "CheckPatternPredicate(unsigned PredNo) const", + true /*AddOverride*/); OS << "{\n"; OS << " switch (PredNo) {\n"; OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n"; for (unsigned i = 0, e = PatternPredicates.size(); i != e; ++i) - OS << " case " << i << ": return " << PatternPredicates[i] << ";\n"; + OS << " case " << i << ": return " << PatternPredicates[i] << ";\n"; OS << " }\n"; OS << "}\n"; EndEmitFunction(OS); @@ -1107,11 +1107,12 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { // Emit CompletePattern matchers. // FIXME: This should be const. if (!ComplexPatterns.empty()) { - BeginEmitFunction(OS, "bool", - "CheckComplexPattern(SDNode *Root, SDNode *Parent,\n" - " SDValue N, unsigned PatternNo,\n" - " SmallVectorImpl> &Result)", - true/*AddOverride*/); + BeginEmitFunction( + OS, "bool", + "CheckComplexPattern(SDNode *Root, SDNode *Parent,\n" + " SDValue N, unsigned PatternNo,\n" + " SmallVectorImpl> &Result)", + true /*AddOverride*/); OS << "{\n"; OS << " unsigned NextRes = Result.size();\n"; OS << " switch (PatternNo) {\n"; @@ -1121,7 +1122,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { unsigned NumOps = P.getNumOperands(); if (P.hasProperty(SDNPHasChain)) - ++NumOps; // Get the chained node too. + ++NumOps; // Get the chained node too. OS << " case " << i << ":\n"; if (InstrumentCoverage) @@ -1160,12 +1161,12 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { EndEmitFunction(OS); } - // Emit SDNodeXForm handlers. // FIXME: This should be const. if (!NodeXForms.empty()) { BeginEmitFunction(OS, "SDValue", - "RunSDNodeXForm(SDValue V, unsigned XFormNo)", true/*AddOverride*/); + "RunSDNodeXForm(SDValue V, unsigned XFormNo)", + true /*AddOverride*/); OS << "{\n"; OS << " switch (XFormNo) {\n"; OS << " default: llvm_unreachable(\"Invalid xform # in table?\");\n"; @@ -1173,7 +1174,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) { // FIXME: The node xform could take SDValue's instead of SDNode*'s. 
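
EmitPredicateFunctions above turns each collected table into a generated C++ function whose body is a switch over the table index, exactly the shape visible in the emitted strings. A toy generator in the same spirit; the function shape mirrors CheckPatternPredicate, and the sample predicate string is illustrative, not LLVM API:

  // Table-to-switch code generation, as in EmitPredicateFunctions.
  #include <iostream>
  #include <string>
  #include <vector>

  void emitPredicateSwitch(const std::vector<std::string> &Preds,
                           std::ostream &OS) {
    OS << "bool CheckPatternPredicate(unsigned PredNo) const {\n";
    OS << "  switch (PredNo) {\n";
    OS << "  default: llvm_unreachable(\"Invalid predicate in table?\");\n";
    for (unsigned i = 0, e = Preds.size(); i != e; ++i)
      OS << "  case " << i << ": return " << Preds[i] << ";\n";
    OS << "  }\n}\n";
  }

  int main() {
    std::vector<std::string> Preds = {"Subtarget->hasSSE2()"}; // illustrative
    emitPredicateSwitch(Preds, std::cout);
  }
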
for (unsigned i = 0, e = NodeXForms.size(); i != e; ++i) { const CodeGenDAGPatterns::NodeXForm &Entry = - CGP.getSDNodeTransform(NodeXForms[i]); + CGP.getSDNodeTransform(NodeXForms[i]); Record *SDNode = Entry.first; const std::string &Code = Entry.second; @@ -1281,8 +1282,7 @@ static StringRef getOpcodeString(Matcher::KindTy Kind) { llvm_unreachable("Unhandled opcode?"); } -void MatcherTableEmitter::EmitHistogram(const Matcher *M, - raw_ostream &OS) { +void MatcherTableEmitter::EmitHistogram(const Matcher *M, raw_ostream &OS) { if (OmitComments) return; @@ -1295,9 +1295,7 @@ void MatcherTableEmitter::EmitHistogram(const Matcher *M, OS << '\n'; } - -void llvm::EmitMatcherTable(Matcher *TheMatcher, - const CodeGenDAGPatterns &CGP, +void llvm::EmitMatcherTable(Matcher *TheMatcher, const CodeGenDAGPatterns &CGP, raw_ostream &OS) { OS << "#if defined(GET_DAGISEL_DECL) && defined(GET_DAGISEL_BODY)\n"; OS << "#error GET_DAGISEL_DECL and GET_DAGISEL_BODY cannot be both defined, "; @@ -1328,7 +1326,7 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher, OS << "#define DAGISEL_CLASS_COLONCOLON\n"; OS << "#endif\n\n"; - BeginEmitFunction(OS, "void", "SelectCode(SDNode *N)", false/*AddOverride*/); + BeginEmitFunction(OS, "void", "SelectCode(SDNode *N)", false /*AddOverride*/); MatcherTableEmitter MatcherEmitter(TheMatcher, CGP); // First we size all the children of the three kinds of matchers that have @@ -1348,7 +1346,8 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher, OS << " #define TARGET_VAL(X) X & 255, unsigned(X) >> 8\n"; OS << " static const unsigned char MatcherTable[] = {\n"; TotalSize = MatcherEmitter.EmitMatcherList(TheMatcher, 1, 0, OS); - OS << " 0\n }; // Total Array size is " << (TotalSize+1) << " bytes\n\n"; + OS << " 0\n }; // Total Array size is " << (TotalSize + 1) + << " bytes\n\n"; MatcherEmitter.EmitHistogram(TheMatcher, OS); diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 3526e97..8ca7aae 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -20,7 +20,6 @@ #include <utility> using namespace llvm; - /// getRegisterValueType - Look up and return the ValueType of the specified /// register. If the register is a member of multiple register classes, they /// must all have the same type. @@ -52,96 +51,97 @@ static MVT::SimpleValueType getRegisterValueType(Record *R, return VT; } - namespace { - class MatcherGen { - const PatternToMatch &Pattern; - const CodeGenDAGPatterns &CGP; - - /// PatWithNoTypes - This is a clone of Pattern.getSrcPattern() that starts - /// out with all of the types removed. This allows us to insert type checks - /// as we scan the tree. - TreePatternNodePtr PatWithNoTypes; - - /// VariableMap - A map from variable names ('$dst') to the recorded operand - /// number that they were captured as. These are biased by 1 to make - /// insertion easier. - StringMap<unsigned> VariableMap; - - /// This maintains the recorded operand number that OPC_CheckComplexPattern - /// drops each sub-operand into. We don't want to insert these into - /// VariableMap because that leads to identity checking if they are - /// encountered multiple times. Biased by 1 like VariableMap for - /// consistency. - StringMap<unsigned> NamedComplexPatternOperands; - - /// NextRecordedOperandNo - As we emit opcodes to record matched values in - /// the RecordedNodes array, this keeps track of which slot will be next to - /// record into.
- unsigned NextRecordedOperandNo; - - /// MatchedChainNodes - This maintains the position in the recorded nodes - /// array of all of the recorded input nodes that have chains. - SmallVector<unsigned, 3> MatchedChainNodes; - - /// MatchedComplexPatterns - This maintains a list of all of the - /// ComplexPatterns that we need to check. The second element of each pair - /// is the recorded operand number of the input node. - SmallVector<std::pair<const TreePatternNode *, unsigned>, 2> MatchedComplexPatterns; - - /// PhysRegInputs - List list has an entry for each explicitly specified - /// physreg input to the pattern. The first elt is the Register node, the - /// second is the recorded slot number the input pattern match saved it in. - SmallVector<std::pair<Record *, unsigned>, 2> PhysRegInputs; - - /// Matcher - This is the top level of the generated matcher, the result. - Matcher *TheMatcher; - - /// CurPredicate - As we emit matcher nodes, this points to the latest check - /// which should have future checks stuck into its Next position. - Matcher *CurPredicate; - public: - MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp); - - bool EmitMatcherCode(unsigned Variant); - void EmitResultCode(); - - Matcher *GetMatcher() const { return TheMatcher; } - private: - void AddMatcher(Matcher *NewNode); - void InferPossibleTypes(); - - // Matcher Generation. - void EmitMatchCode(const TreePatternNode *N, TreePatternNode *NodeNoTypes); - void EmitLeafMatchCode(const TreePatternNode *N); - void EmitOperatorMatchCode(const TreePatternNode *N, - TreePatternNode *NodeNoTypes); - - /// If this is the first time a node with unique identifier Name has been - /// seen, record it. Otherwise, emit a check to make sure this is the same - /// node. Returns true if this is the first encounter. - bool recordUniqueNode(ArrayRef<std::string> Names); - - // Result Code Generation. - unsigned getNamedArgumentSlot(StringRef Name) { - unsigned VarMapEntry = VariableMap[Name]; - assert(VarMapEntry != 0 && - "Variable referenced but not defined and not caught earlier!"); - return VarMapEntry-1; - } +class MatcherGen { + const PatternToMatch &Pattern; + const CodeGenDAGPatterns &CGP; + + /// PatWithNoTypes - This is a clone of Pattern.getSrcPattern() that starts + /// out with all of the types removed. This allows us to insert type checks + /// as we scan the tree. + TreePatternNodePtr PatWithNoTypes; + + /// VariableMap - A map from variable names ('$dst') to the recorded operand + /// number that they were captured as. These are biased by 1 to make + /// insertion easier. + StringMap<unsigned> VariableMap; + + /// This maintains the recorded operand number that OPC_CheckComplexPattern + /// drops each sub-operand into. We don't want to insert these into + /// VariableMap because that leads to identity checking if they are + /// encountered multiple times. Biased by 1 like VariableMap for + /// consistency. + StringMap<unsigned> NamedComplexPatternOperands; + + /// NextRecordedOperandNo - As we emit opcodes to record matched values in + /// the RecordedNodes array, this keeps track of which slot will be next to + /// record into. + unsigned NextRecordedOperandNo; + + /// MatchedChainNodes - This maintains the position in the recorded nodes + /// array of all of the recorded input nodes that have chains. + SmallVector<unsigned, 3> MatchedChainNodes; + + /// MatchedComplexPatterns - This maintains a list of all of the + /// ComplexPatterns that we need to check. The second element of each pair + /// is the recorded operand number of the input node.
+ SmallVector<std::pair<const TreePatternNode *, unsigned>, 2> + MatchedComplexPatterns; + + /// PhysRegInputs - List list has an entry for each explicitly specified + /// physreg input to the pattern. The first elt is the Register node, the + /// second is the recorded slot number the input pattern match saved it in. + SmallVector<std::pair<Record *, unsigned>, 2> PhysRegInputs; + + /// Matcher - This is the top level of the generated matcher, the result. + Matcher *TheMatcher; + + /// CurPredicate - As we emit matcher nodes, this points to the latest check + /// which should have future checks stuck into its Next position. + Matcher *CurPredicate; + +public: + MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp); + + bool EmitMatcherCode(unsigned Variant); + void EmitResultCode(); + + Matcher *GetMatcher() const { return TheMatcher; } + +private: + void AddMatcher(Matcher *NewNode); + void InferPossibleTypes(); + + // Matcher Generation. + void EmitMatchCode(const TreePatternNode *N, TreePatternNode *NodeNoTypes); + void EmitLeafMatchCode(const TreePatternNode *N); + void EmitOperatorMatchCode(const TreePatternNode *N, + TreePatternNode *NodeNoTypes); + + /// If this is the first time a node with unique identifier Name has been + /// seen, record it. Otherwise, emit a check to make sure this is the same + /// node. Returns true if this is the first encounter. + bool recordUniqueNode(ArrayRef<std::string> Names); + + // Result Code Generation. + unsigned getNamedArgumentSlot(StringRef Name) { + unsigned VarMapEntry = VariableMap[Name]; + assert(VarMapEntry != 0 && + "Variable referenced but not defined and not caught earlier!"); + return VarMapEntry - 1; + } - void EmitResultOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultOfNamedOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultLeafAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultInstructionAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - void EmitResultSDNodeXFormAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps); - }; + void EmitResultOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultOfNamedOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultLeafAsOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultInstructionAsOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); + void EmitResultSDNodeXFormAsOperand(const TreePatternNode *N, + SmallVectorImpl<unsigned> &ResultOps); +}; } // end anonymous namespace @@ -180,11 +180,10 @@ void MatcherGen::InferPossibleTypes() { bool MadeChange = true; while (MadeChange) - MadeChange = PatWithNoTypes->ApplyTypeConstraints(TP, - true/*Ignore reg constraints*/); + MadeChange = PatWithNoTypes->ApplyTypeConstraints( + TP, true /*Ignore reg constraints*/); } - /// AddMatcher - Add a matcher node to the current graph we're building. void MatcherGen::AddMatcher(Matcher *NewNode) { if (CurPredicate) @@ -194,7 +193,6 @@ void MatcherGen::AddMatcher(Matcher *NewNode) { CurPredicate = NewNode; } - //===----------------------------------------------------------------------===// // Pattern Match Generation //===----------------------------------------------------------------------===// @@ -240,7 +238,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { return AddMatcher(new CheckValueTypeMatcher(LeafRec->getName())); } - if (// Handle register references. Nothing to do here, they always match. + if ( // Handle register references.
Nothing to do here, they always match. LeafRec->isSubClassOf("RegisterClass") || LeafRec->isSubClassOf("RegisterOperand") || LeafRec->isSubClassOf("PointerLikeRegClass") || @@ -252,7 +250,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { // If we have a physreg reference like (mul gpr:$src, EAX) then we need to // record the register if (LeafRec->isSubClassOf("Register")) { - AddMatcher(new RecordMatcher("physreg input "+LeafRec->getName().str(), + AddMatcher(new RecordMatcher("physreg input " + LeafRec->getName().str(), NextRecordedOperandNo)); PhysRegInputs.push_back(std::make_pair(LeafRec, NextRecordedOperandNo++)); return; @@ -376,7 +374,7 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, if (N->NodeHasProperty(SDNPHasChain, CGP)) { // Record the node and remember it in our chained nodes list. AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + - "' chained node", + "' chained node", NextRecordedOperandNo)); // Remember all of the input chains our pattern will match. MatchedChainNodes.push_back(NextRecordedOperandNo++); @@ -407,7 +405,7 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // this to be folded. // const TreePatternNode *Root = Pattern.getSrcPattern(); - if (N != Root) { // Not the root of the pattern. + if (N != Root) { // Not the root of the pattern. // If there is a node between the root and this node, then we definitely // need to emit the check. bool NeedCheck = !Root->hasChild(N); @@ -419,13 +417,11 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, if (!NeedCheck) { const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root->getOperator()); NeedCheck = - Root->getOperator() == CGP.get_intrinsic_void_sdnode() || - Root->getOperator() == CGP.get_intrinsic_w_chain_sdnode() || - Root->getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || - PInfo.getNumOperands() > 1 || - PInfo.hasProperty(SDNPHasChain) || - PInfo.hasProperty(SDNPInGlue) || - PInfo.hasProperty(SDNPOptInGlue); + Root->getOperator() == CGP.get_intrinsic_void_sdnode() || + Root->getOperator() == CGP.get_intrinsic_w_chain_sdnode() || + Root->getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || + PInfo.getNumOperands() > 1 || PInfo.hasProperty(SDNPHasChain) || + PInfo.hasProperty(SDNPInGlue) || PInfo.hasProperty(SDNPOptInGlue); } if (NeedCheck) @@ -434,13 +430,12 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, } // If this node has an output glue and isn't the root, remember it. - if (N->NodeHasProperty(SDNPOutGlue, CGP) && - N != Pattern.getSrcPattern()) { + if (N->NodeHasProperty(SDNPOutGlue, CGP) && N != Pattern.getSrcPattern()) { // TODO: This redundantly records nodes with both glues and chains. // Record the node and remember it in our chained nodes list. AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + - "' glue output node", + "' glue output node", NextRecordedOperandNo)); } @@ -485,7 +480,7 @@ bool MatcherGen::recordUniqueNode(ArrayRef Names) { // we already have checked that the first reference is valid, we don't // have to recursively match it, just check that it's the same as the // previously named thing. 
- AddMatcher(new CheckSameMatcher(Entry - 1)); } for (const std::string &Name : Names) @@ -502,7 +497,8 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, SmallVector<unsigned, 4> ResultsToTypeCheck; for (unsigned i = 0, e = NodeNoTypes->getNumTypes(); i != e; ++i) { - if (NodeNoTypes->getExtType(i) == N->getExtType(i)) continue; + if (NodeNoTypes->getExtType(i) == N->getExtType(i)) + continue; NodeNoTypes->setType(i, N->getExtType(i)); InferPossibleTypes(); ResultsToTypeCheck.push_back(i); @@ -515,7 +511,8 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, Names.push_back(N->getName()); for (const ScopedName &Name : N->getNamesAsPredicateArg()) { - Names.push_back(("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str()); + Names.push_back( + ("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str()); } if (!Names.empty()) { @@ -557,14 +554,17 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // Depending on which variant we're generating code for, emit the root opcode // check. if (const ComplexPattern *CP = - Pattern.getSrcPattern()->getComplexPatternInfo(CGP)) { - const std::vector<Record *> &OpNodes = CP->getRootNodes(); - assert(!OpNodes.empty() &&"Complex Pattern must specify what it can match"); - if (Variant >= OpNodes.size()) return true; + Pattern.getSrcPattern()->getComplexPatternInfo(CGP)) { + const std::vector<Record *> &OpNodes = CP->getRootNodes(); + assert(!OpNodes.empty() && + "Complex Pattern must specify what it can match"); + if (Variant >= OpNodes.size()) + return true; AddMatcher(new CheckOpcodeMatcher(CGP.getSDNodeInfo(OpNodes[Variant]))); } else { - if (Variant != 0) return true; + if (Variant != 0) + return true; } // Emit the matcher for the pattern structure and types. @@ -616,7 +616,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // It is the last operand recorded.
assert(NextRecordedOperandNo > 1 && "Should have recorded input/result chains at least!"); - MatchedChainNodes.push_back(NextRecordedOperandNo-1); + MatchedChainNodes.push_back(NextRecordedOperandNo - 1); } // TODO: Complex patterns can't have output glues, if they did, we'd want @@ -626,13 +626,12 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { return false; } - //===----------------------------------------------------------------------===// // Node Result Generation //===----------------------------------------------------------------------===// -void MatcherGen::EmitResultOfNamedOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps){ +void MatcherGen::EmitResultOfNamedOperand( + const TreePatternNode *N, SmallVectorImpl<unsigned> &ResultOps) { assert(!N->getName().empty() && "Operand not named!"); if (unsigned SlotNo = NamedComplexPatternOperands[N->getName()]) { @@ -676,8 +675,7 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, if (DefInit *DI = dyn_cast<DefInit>(N->getLeafValue())) { Record *Def = DI->getDef(); if (Def->isSubClassOf("Register")) { - const CodeGenRegister *Reg = - CGP.getTargetInfo().getRegBank().getReg(Def); + const CodeGenRegister *Reg = CGP.getTargetInfo().getRegBank().getReg(Def); AddMatcher(new EmitRegisterMatcher(Reg, N->getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; @@ -746,18 +744,16 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, N->dump(); } -static bool -mayInstNodeLoadOrStore(const TreePatternNode *N, - const CodeGenDAGPatterns &CGP) { +static bool mayInstNodeLoadOrStore(const TreePatternNode *N, + const CodeGenDAGPatterns &CGP) { Record *Op = N->getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); return II.mayLoad || II.mayStore; } -static unsigned -numNodesThatMayLoadOrStore(const TreePatternNode *N, - const CodeGenDAGPatterns &CGP) { +static unsigned numNodesThatMayLoadOrStore(const TreePatternNode *N, + const CodeGenDAGPatterns &CGP) { if (N->isLeaf()) return 0; @@ -775,9 +771,8 @@ numNodesThatMayLoadOrStore(const TreePatternNode *N, return Count; } -void MatcherGen:: -EmitResultInstructionAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &OutputOps) { +void MatcherGen::EmitResultInstructionAsOperand( + const TreePatternNode *N, SmallVectorImpl<unsigned> &OutputOps) { Record *Op = N->getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); @@ -823,11 +818,11 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // filled in with their defaults unconditionally. unsigned NonOverridableOperands = NumFixedOperands; while (NonOverridableOperands > NumResults && - CGP.operandHasDefault(II.Operands[NonOverridableOperands-1].Rec)) + CGP.operandHasDefault(II.Operands[NonOverridableOperands - 1].Rec)) --NonOverridableOperands; - for (unsigned InstOpNo = NumResults, e = NumFixedOperands; - InstOpNo != e; ++InstOpNo) { + for (unsigned InstOpNo = NumResults, e = NumFixedOperands; InstOpNo != e; + ++InstOpNo) { // Determine what to emit for this operand. Record *OperandNode = II.Operands[InstOpNo].Rec; if (CGP.operandHasDefault(OperandNode) && @@ -835,8 +830,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands.
- const DAGDefaultOperand &DefaultOp - = CGP.getDefaultOperand(OperandNode); + const DAGDefaultOperand &DefaultOp = CGP.getDefaultOperand(OperandNode); for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i) EmitResultOperand(DefaultOp.DefaultOps[i].get(), InstOps); continue; @@ -865,7 +859,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // If the operand is an instruction and it produced multiple results, just // take the first one. if (!Child->isLeaf() && Child->getOperator()->isSubClassOf("Instruction")) - InstOps.resize(BeforeAddingNumOps+1); + InstOps.resize(BeforeAddingNumOps + 1); ++ChildNo; } @@ -889,9 +883,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // occur in patterns like (mul:i8 AL:i8, GR8:i8:$src). for (unsigned i = 0, e = PhysRegInputs.size(); i != e; ++i) { const CodeGenRegister *Reg = - CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first); - AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, - Reg)); + CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first); + AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, Reg)); } // Even if the node has no other glue inputs, the resultant node must be @@ -919,7 +912,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, HandledReg = II.ImplicitDefs[0]; for (Record *Reg : Pattern.getDstRegs()) { - if (!Reg->isSubClassOf("Register") || Reg == HandledReg) continue; + if (!Reg->isSubClassOf("Register") || Reg == HandledReg) + continue; ResultVTs.push_back(getRegisterValueType(Reg, CGT)); } } @@ -928,8 +922,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // a node that is variadic, mark the generated node as variadic so that it // gets the excess operands from the input DAG. int NumFixedArityOperands = -1; - if (isRoot && - Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)) + if (isRoot && Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)) NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren(); // If this is the root node and multiple matched nodes in the input pattern @@ -940,17 +933,17 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // FIXME3: This is actively incorrect for result patterns with multiple // memory-referencing instructions. bool PatternHasMemOperands = - Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP); + Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP); bool NodeHasMemRefs = false; if (PatternHasMemOperands) { unsigned NumNodesThatLoadOrStore = - numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP); - bool NodeIsUniqueLoadOrStore = mayInstNodeLoadOrStore(N, CGP) && - NumNodesThatLoadOrStore == 1; + numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP); + bool NodeIsUniqueLoadOrStore = + mayInstNodeLoadOrStore(N, CGP) && NumNodesThatLoadOrStore == 1; NodeHasMemRefs = - NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) || - NumNodesThatLoadOrStore != 1)); + NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) || + NumNodesThatLoadOrStore != 1)); } // Determine whether we need to attach a chain to this node. @@ -982,14 +975,14 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // The non-chain and non-glue results of the newly emitted node get recorded. 
for (unsigned i = 0, e = ResultVTs.size(); i != e; ++i) { - if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue) break; + if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue) + break; OutputOps.push_back(NextRecordedOperandNo++); } } -void MatcherGen:: -EmitResultSDNodeXFormAsOperand(const TreePatternNode *N, - SmallVectorImpl<unsigned> &ResultOps) { +void MatcherGen::EmitResultSDNodeXFormAsOperand( + const TreePatternNode *N, SmallVectorImpl<unsigned> &ResultOps) { assert(N->getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?"); // Emit the operand. @@ -1051,7 +1044,8 @@ void MatcherGen::EmitResultCode() { // don't re-add it. Record *HandledReg = nullptr; const TreePatternNode *DstPat = Pattern.getDstPattern(); - if (!DstPat->isLeaf() &&DstPat->getOperator()->isSubClassOf("Instruction")){ + if (!DstPat->isLeaf() && + DstPat->getOperator()->isSubClassOf("Instruction")) { const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(DstPat->getOperator()); @@ -1060,7 +1054,8 @@ void MatcherGen::EmitResultCode() { for (Record *Reg : Pattern.getDstRegs()) { - if (!Reg->isSubClassOf("Register") || Reg == HandledReg) continue; + if (!Reg->isSubClassOf("Register") || Reg == HandledReg) + continue; ++NumSrcResults; } } @@ -1077,7 +1072,6 @@ void MatcherGen::EmitResultCode() { AddMatcher(new CompleteMatchMatcher(Results, Pattern)); } - /// ConvertPatternToMatcher - Create the matcher for the specified pattern with /// the specified variant. If the variant number is invalid, this returns null. Matcher *llvm::ConvertPatternToMatcher(const PatternToMatch &Pattern, diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index c4c25dc..b137492 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -311,10 +311,9 @@ static void FactorNodes(std::unique_ptr<Matcher> &InputMatcherPtr) { // Don't print if it's obvious nothing extra could be merged anyway.
std::next(J) != E) { LLVM_DEBUG(errs() << "Couldn't merge this:\n"; Optn->print(errs(), 4); - errs() << "into this:\n"; - (*J)->print(errs(), 4); + errs() << "into this:\n"; (*J)->print(errs(), 4); (*std::next(J))->printOne(errs()); - if (std::next(J, 2) != E) (*std::next(J, 2))->printOne(errs()); + if (std::next(J, 2) != E)(*std::next(J, 2))->printOne(errs()); errs() << "\n"); } diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp index 54ad81c..0d22ad2 100644 --- a/llvm/utils/TableGen/DFAEmitter.cpp +++ b/llvm/utils/TableGen/DFAEmitter.cpp @@ -147,8 +147,8 @@ void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; OS << "// The initial state is 1, not zero.\n"; - OS << "const std::array<" << Name << "Transition, " - << DfaTransitions.size() << "> " << Name << "Transitions = {{\n"; + OS << "const std::array<" << Name << "Transition, " << DfaTransitions.size() + << "> " << Name << "Transitions = {{\n"; for (auto &KV : DfaTransitions) { dfa_state_type From = KV.first.first; dfa_state_type To = KV.second.first; @@ -284,7 +284,7 @@ void Automaton::emit(raw_ostream &OS) { } LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() << " states with " << NumTransitions << " transitions.\n"); - (void) NumTransitions; + (void)NumTransitions; const auto &ActionTypes = Transitions.back().getTypes(); OS << "// The type of an action in the " << Name << " automaton.\n"; @@ -346,9 +346,7 @@ bool Transition::canTransitionFrom(uint64_t State) { return false; } -uint64_t Transition::transitionFrom(uint64_t State) { - return State | NewState; -} +uint64_t Transition::transitionFrom(uint64_t State) { return State | NewState; } void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } diff --git a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp index 64c7884..26ea184 100644 --- a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp +++ b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp @@ -72,8 +72,7 @@ public: DFAPacketizerEmitter(RecordKeeper &R); // Construct a map of function unit names to bits. - int collectAllFuncUnits( - ArrayRef ProcModels); + int collectAllFuncUnits(ArrayRef ProcModels); // Construct a map from a combo function unit bit to the bits of all included // functional units. 
@@ -129,7 +128,8 @@ int DFAPacketizerEmitter::collectAllFuncUnits( return totalFUs; } -int DFAPacketizerEmitter::collectAllComboFuncs(ArrayRef ComboFuncList) { +int DFAPacketizerEmitter::collectAllComboFuncs( + ArrayRef ComboFuncList) { LLVM_DEBUG(dbgs() << "-------------------------------------------------------" "----------------------\n"); LLVM_DEBUG(dbgs() << "collectAllComboFuncs"); diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index cb9f9c6..25e818a 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -42,22 +42,23 @@ struct DXILParameter { }; struct DXILOperationDesc { - StringRef OpName; // name of DXIL operation - int OpCode; // ID of DXIL operation - StringRef OpClass; // name of the opcode class - StringRef Category; // classification for this instruction - StringRef Doc; // the documentation description of this instruction + StringRef OpName; // name of DXIL operation + int OpCode; // ID of DXIL operation + StringRef OpClass; // name of the opcode class + StringRef Category; // classification for this instruction + StringRef Doc; // the documentation description of this instruction SmallVector Params; // the operands that this instruction takes - StringRef OverloadTypes; // overload types if applicable - StringRef FnAttr; // attribute shorthands: rn=does not access - // memory,ro=only reads from memory - StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which - // means no map exist - bool IsDeriv = false; // whether this is some kind of derivative + StringRef OverloadTypes; // overload types if applicable + StringRef FnAttr; // attribute shorthands: rn=does not access + // memory,ro=only reads from memory + StringRef Intrinsic; // The llvm intrinsic map to OpName. 
Default is "" which + // means no map exist + bool IsDeriv = false; // whether this is some kind of derivative bool IsGradient = false; // whether this requires a gradient calculation bool IsFeedback = false; // whether this is a sampler feedback op - bool IsWave = false; // whether this requires in-wave, cross-lane functionality + bool IsWave = + false; // whether this requires in-wave, cross-lane functionality bool RequiresUniformInputs = false; // whether this operation requires that // all of its inputs are uniform across // the wave diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 591ee5c..2f28ccb 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -53,7 +53,8 @@ using namespace llvm; namespace { STATISTIC(NumEncodings, "Number of encodings considered"); -STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info"); +STATISTIC(NumEncodingsLackingDisasm, + "Number of encodings without disassembler info"); STATISTIC(NumInstructions, "Number of instructions considered"); STATISTIC(NumEncodingsSupported, "Number of encodings supported"); STATISTIC(NumEncodingsOmitted, "Number of encodings omitted"); @@ -61,7 +62,7 @@ STATISTIC(NumEncodingsOmitted, "Number of encodings omitted"); struct EncodingField { unsigned Base, Width, Offset; EncodingField(unsigned B, unsigned W, unsigned O) - : Base(B), Width(W), Offset(O) { } + : Base(B), Width(W), Offset(O) {} }; struct OperandInfo { @@ -82,7 +83,7 @@ struct OperandInfo { typedef std::vector<EncodingField>::const_iterator const_iterator; const_iterator begin() const { return Fields.begin(); } - const_iterator end() const { return Fields.end(); } + const_iterator end() const { return Fields.end(); } }; typedef std::vector<uint8_t> DecoderTable; @@ -141,8 +142,7 @@ public: void emitPredicateFunction(formatted_raw_ostream &OS, PredicateSet &Predicates, unsigned Indentation) const; - void emitDecoderFunction(formatted_raw_ostream &OS, - DecoderSet &Decoders, + void emitDecoderFunction(formatted_raw_ostream &OS, DecoderSet &Decoders, unsigned Indentation) const; // run - Output the code emitter @@ -173,9 +173,7 @@ static bool ValueSet(bit_value_t V) { return (V == BIT_TRUE || V == BIT_FALSE); } -static bool ValueNotSet(bit_value_t V) { - return (V == BIT_UNSET); -} +static bool ValueNotSet(bit_value_t V) { return (V == BIT_UNSET); } static int Value(bit_value_t V) { return ValueNotSet(V) ? -1 : (V == BIT_FALSE ? 0 : 1); @@ -280,14 +278,14 @@ class FilterChooser; /// version and return the Opcode since the two have the same Asm format string. class Filter { protected: - const FilterChooser *Owner;// points to the FilterChooser who owns this filter + const FilterChooser + *Owner; // points to the FilterChooser who owns this filter unsigned StartBit; // the starting bit position - unsigned NumBits; // number of bits to filter - bool Mixed; // a mixed region contains both set and unset bits + unsigned NumBits; // number of bits to filter + bool Mixed; // a mixed region contains both set and unset bits // Map of well-known segment value to the set of uid's with that value. - std::map<uint64_t, std::vector<unsigned>> - FilteredInstructions; + std::map<uint64_t, std::vector<unsigned>> FilteredInstructions; // Set of uid's with non-constant segment values. std::vector<unsigned> VariableInstructions; @@ -471,7 +469,7 @@ protected: /// dumpFilterArray - dumpFilterArray prints out debugging info for the given /// filter array as a series of chars.
void dumpFilterArray(raw_ostream &o, - const std::vector & filter) const; + const std::vector &filter) const; /// dumpStack - dumpStack traverses the filter chooser chain and calls /// dumpFilterArray on each filter chooser up to the top level one. @@ -504,11 +502,9 @@ protected: bool doesOpcodeNeedPredicate(unsigned Opc) const; unsigned getPredicateIndex(DecoderTableInfo &TableInfo, StringRef P) const; - void emitPredicateTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const; + void emitPredicateTableEntry(DecoderTableInfo &TableInfo, unsigned Opc) const; - void emitSoftFailTableEntry(DecoderTableInfo &TableInfo, - unsigned Opc) const; + void emitSoftFailTableEntry(DecoderTableInfo &TableInfo, unsigned Opc) const; // Emits table entries to decode the singleton. void emitSingletonTableEntry(DecoderTableInfo &TableInfo, @@ -560,16 +556,15 @@ public: /////////////////////////// Filter::Filter(Filter &&f) - : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed), - FilteredInstructions(std::move(f.FilteredInstructions)), - VariableInstructions(std::move(f.VariableInstructions)), - FilterChooserMap(std::move(f.FilterChooserMap)), NumFiltered(f.NumFiltered), - LastOpcFiltered(f.LastOpcFiltered) { -} + : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed), + FilteredInstructions(std::move(f.FilteredInstructions)), + VariableInstructions(std::move(f.VariableInstructions)), + FilterChooserMap(std::move(f.FilterChooserMap)), + NumFiltered(f.NumFiltered), LastOpcFiltered(f.LastOpcFiltered) {} Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, bool mixed) - : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) { + : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) { assert(StartBit + NumBits - 1 < Owner->BitWidth); NumFiltered = 0; @@ -598,8 +593,8 @@ Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, } } - assert((FilteredInstructions.size() + VariableInstructions.size() > 0) - && "Filter returns no instruction categories"); + assert((FilteredInstructions.size() + VariableInstructions.size() > 0) && + "Filter returns no instruction categories"); } // Divides the decoding task into sub tasks and delegates them to the @@ -619,9 +614,11 @@ void Filter::recurse() { // Delegates to an inferior filter chooser for further processing on this // group of instructions whose segment values are variable. - FilterChooserMap.insert(std::make_pair(NO_FIXED_SEGMENTS_SENTINEL, + FilterChooserMap.insert(std::make_pair( + NO_FIXED_SEGMENTS_SENTINEL, std::make_unique(Owner->AllInstructions, - VariableInstructions, Owner->Operands, BitValueArray, *Owner))); + VariableInstructions, Owner->Operands, + BitValueArray, *Owner))); } // No need to recurse for a singleton filtered instruction. @@ -646,8 +643,8 @@ void Filter::recurse() { // category of instructions. FilterChooserMap.insert(std::make_pair( Inst.first, std::make_unique( - Owner->AllInstructions, Inst.second, - Owner->Operands, BitValueArray, *Owner))); + Owner->AllInstructions, Inst.second, Owner->Operands, + BitValueArray, *Owner))); } } @@ -655,8 +652,7 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, uint32_t DestIdx) { // Any NumToSkip fixups in the current scope can resolve to the // current location. 
- for (FixupList::const_reverse_iterator I = Fixups.rbegin(), - E = Fixups.rend(); + for (FixupList::const_reverse_iterator I = Fixups.rbegin(), E = Fixups.rend(); I != E; ++I) { // Calculate the distance from the byte following the fixup entry byte // to the destination. The Target is calculated from after the 16-bit @@ -705,7 +701,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { // Resolve any NumToSkip fixups in the current scope. resolveTableFixups(Table, CurScope, Table.size()); CurScope.clear(); - PrevFilter = 0; // Don't re-process the filter's fallthrough. + PrevFilter = 0; // Don't re-process the filter's fallthrough. } else { Table.push_back(MCD::OPC_FilterValue); // Encode and emit the value to filter against. @@ -731,7 +727,8 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { // two as to account for the width of the NumToSkip field itself. if (PrevFilter) { uint32_t NumToSkip = Table.size() - PrevFilter - 3; - assert(NumToSkip < (1u << 24) && "disassembler decoding table too large!"); + assert(NumToSkip < (1u << 24) && + "disassembler decoding table too large!"); Table[PrevFilter] = (uint8_t)NumToSkip; Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); @@ -771,7 +768,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, unsigned Indentation, unsigned BitWidth, StringRef Namespace) const { OS.indent(Indentation) << "static const uint8_t DecoderTable" << Namespace - << BitWidth << "[] = {\n"; + << BitWidth << "[] = {\n"; Indentation += 2; @@ -807,7 +804,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, DecoderTable::const_iterator I = Table.begin(); DecoderTable::const_iterator E = Table.end(); while (I != E) { - assert (I < E && "incomplete decode table entry!"); + assert(I < E && "incomplete decode table entry!"); uint64_t Pos = I - Table.begin(); OS << "/* " << Pos << " */"; @@ -884,8 +881,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, Table.data() + Table.size(), &ErrMsg); assert(ErrMsg == nullptr && "ULEB128 value too large!"); - OS.indent(Indentation) << "MCD::OPC_" << (IsTry ? "Try" : "") - << "Decode, "; + OS.indent(Indentation) + << "MCD::OPC_" << (IsTry ? "Try" : "") << "Decode, "; I += emitULEB128(I, OS); // Decoder index. @@ -967,15 +964,16 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS, // The predicate function is just a big switch statement based on the // input predicate index. OS.indent(Indentation) << "static bool checkDecoderPredicate(unsigned Idx, " - << "const FeatureBitset &Bits) {\n"; + << "const FeatureBitset &Bits) {\n"; Indentation += 2; if (!Predicates.empty()) { OS.indent(Indentation) << "switch (Idx) {\n"; - OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n"; + OS.indent(Indentation) + << "default: llvm_unreachable(\"Invalid index!\");\n"; unsigned Index = 0; for (const auto &Predicate : Predicates) { OS.indent(Indentation) << "case " << Index++ << ":\n"; - OS.indent(Indentation+2) << "return (" << Predicate << ");\n"; + OS.indent(Indentation + 2) << "return (" << Predicate << ");\n"; } OS.indent(Indentation) << "}\n"; } else { @@ -993,7 +991,7 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, // input decoder index. 
OS.indent(Indentation) << "template <typename InsnType>\n"; OS.indent(Indentation) << "static DecodeStatus decodeToMCInst(DecodeStatus S," - << " unsigned Idx, InsnType insn, MCInst &MI,\n"; + << " unsigned Idx, InsnType insn, MCInst &MI,\n"; OS.indent(Indentation) << " uint64_t " << "Address, const MCDisassembler *Decoder, bool &DecodeComplete) {\n"; @@ -1012,7 +1010,7 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, for (const auto &Decoder : Decoders) { OS.indent(Indentation) << "case " << Index++ << ":\n"; OS << Decoder; - OS.indent(Indentation+2) << "return S;\n"; + OS.indent(Indentation + 2) << "return S;\n"; OS.indent(Indentation) << "}\n"; Indentation -= 2; @@ -1041,8 +1039,8 @@ bool FilterChooser::fieldFromInsn(uint64_t &Field, insn_t &Insn, /// dumpFilterArray - dumpFilterArray prints out debugging info for the given /// filter array as a series of chars. -void FilterChooser::dumpFilterArray(raw_ostream &o, - const std::vector<bit_value_t> &filter) const { +void FilterChooser::dumpFilterArray( + raw_ostream &o, const std::vector<bit_value_t> &filter) const { for (unsigned bitIndex = BitWidth; bitIndex > 0; bitIndex--) { switch (filter[bitIndex - 1]) { case BIT_UNFILTERED: @@ -1096,7 +1094,8 @@ unsigned FilterChooser::getIslands(std::vector<unsigned> &StartBits, int64_t Val = Value(Insn[i]); bool Filtered = PositionFiltered(i); switch (State) { - default: llvm_unreachable("Unreachable code!"); + default: + llvm_unreachable("Unreachable code!"); case 0: case 1: if (Filtered || Val == -1) @@ -1197,8 +1196,7 @@ void FilterChooser::emitDecoder(raw_ostream &OS, unsigned Indentation, } } -unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders, - unsigned Opc, +unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders, unsigned Opc, bool &HasCompleteDecoder) const { // Build up the predicate string. SmallString<256> Decoder; @@ -1343,7 +1341,8 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, const RecordVal *RV = AllInstructions[Opc].EncodingDef->getValue("SoftFail"); BitsInit *SFBits = RV ? dyn_cast<BitsInit>(RV->getValue()) : nullptr; - if (!SFBits) return; + if (!SFBits) + return; BitsInit *InstBits = AllInstructions[Opc].EncodingDef->getValueAsBitsInit("Inst"); @@ -1353,7 +1352,8 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, bit_value_t B = bitFromBits(*SFBits, i); bit_value_t IB = bitFromBits(*InstBits, i); - if (B != BIT_TRUE) continue; + if (B != BIT_TRUE) + continue; switch (IB) { case BIT_FALSE: @@ -1458,12 +1458,12 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, // decoder method indicates that additional processing should be done to see // if there is any other instruction that also matches the bitpattern and // can decode it. - TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode : - MCD::OPC_TryDecode); + TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode + : MCD::OPC_TryDecode); NumEncodingsSupported++; uint8_t Buffer[16], *p; encodeULEB128(Opc.Opcode, Buffer); - for (p = Buffer; *p >= 128 ; ++p) + for (p = Buffer; *p >= 128; ++p) TableInfo.Table.push_back(*p); TableInfo.Table.push_back(*p); @@ -1825,8 +1825,8 @@ static std::string findOperandDecoderMethod(Record *Record) { std::string Decoder; RecordVal *DecoderString = Record->getValue("DecoderMethod"); - StringInit *String = DecoderString ? - dyn_cast<StringInit>(DecoderString->getValue()) : nullptr; + StringInit *String = + DecoderString ?
dyn_cast<StringInit>(DecoderString->getValue()) : nullptr; if (String) { Decoder = std::string(String->getValue()); if (!Decoder.empty()) @@ -1840,7 +1840,7 @@ static std::string findOperandDecoderMethod(Record *Record) { Decoder = "Decode" + Record->getName().str() + "RegisterClass"; } else if (Record->isSubClassOf("PointerLikeRegClass")) { Decoder = "DecodePointerLikeRegClass" + - utostr(Record->getValueAsInt("RegClassKind")); + utostr(Record->getValueAsInt("RegClassKind")); } return Decoder; @@ -1986,7 +1986,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, // of trying to auto-generate the decoder. StringRef InstDecoder = EncodingDef.getValueAsString("DecoderMethod"); if (InstDecoder != "") { - bool HasCompleteInstDecoder = EncodingDef.getValueAsBit("hasCompleteDecoder"); + bool HasCompleteInstDecoder = + EncodingDef.getValueAsBit("hasCompleteDecoder"); InsnOperands.push_back( OperandInfo(std::string(InstDecoder), HasCompleteInstDecoder)); Operands[Opc] = InsnOperands; @@ -2000,9 +2001,9 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, // Gather the outputs/inputs of the instruction, so we can find their // positions in the encoding. This assumes for now that they appear in the // MCInst in the order that they're listed. - std::vector<std::pair<Init *, StringRef>> InOutOperands; - DagInit *Out = Def.getValueAsDag("OutOperandList"); - DagInit *In = Def.getValueAsDag("InOperandList"); + std::vector<std::pair<Init *, StringRef>> InOutOperands; + DagInit *Out = Def.getValueAsDag("OutOperandList"); + DagInit *In = Def.getValueAsDag("InOperandList"); for (unsigned i = 0; i < Out->getNumArgs(); ++i) InOutOperands.push_back( std::make_pair(Out->getArg(i), Out->getArgNameStr(i))); @@ -2042,7 +2043,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, Init *OpInit = Op.first; StringRef OpName = Op.second; - // We're ready to find the instruction encoding locations for this operand. + // We're ready to find the instruction encoding locations for this + // operand. // First, find the operand type ("OpInit"), and sub-op names // ("SubArgDag") if present. @@ -2056,7 +2058,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, ? OpTypeRec->getValueAsDag("MIOperandInfo") : nullptr; - // Lookup the decoder method and construct a new OperandInfo to hold our result. + // Lookup the decoder method and construct a new OperandInfo to hold our + // result. OperandInfo OpInfo = getOpInfo(OpTypeRec); // If we have named sub-operands... @@ -2490,7 +2493,8 @@ void DecoderEmitter::run(raw_ostream &o) { NumberedEncodings.emplace_back(NumberedInstruction->TheDef, NumberedInstruction, HwModeName); } - for (const auto &NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding")) + for (const auto &NumberedAlias : + RK.getAllDerivedDefinitions("AdditionalEncoding")) NumberedEncodings.emplace_back( NumberedAlias, &Target.getInstruction(NumberedAlias->getValueAsDef("AliasOf"))); @@ -2551,8 +2555,8 @@ void DecoderEmitter::run(raw_ostream &o) { DecoderTableInfo TableInfo; for (const auto &Opc : OpcMap) { // Emit the decoder for this namespace+width combination. - ArrayRef<EncodingAndInst> NumberedEncodingsRef( - NumberedEncodings.data(), NumberedEncodings.size()); + ArrayRef<EncodingAndInst> NumberedEncodingsRef(NumberedEncodings.data(), + NumberedEncodings.size()); FilterChooser FC(NumberedEncodingsRef, Opc.second, Operands, IsVarLenInst ?
MaxInstLen : 8 * Opc.first.second, this); diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp index 92f3721..ae6a8ef 100644 --- a/llvm/utils/TableGen/DisassemblerEmitter.cpp +++ b/llvm/utils/TableGen/DisassemblerEmitter.cpp @@ -102,8 +102,8 @@ static void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) { if (Target.getName() == "X86") { DisassemblerTables Tables; - ArrayRef<const CodeGenInstruction *> numberedInstructions = - Target.getInstructionsByEnumValue(); + ArrayRef<const CodeGenInstruction *> numberedInstructions = + Target.getInstructionsByEnumValue(); for (unsigned i = 0, e = numberedInstructions.size(); i != e; ++i) RecognizableInstr::processInstr(Tables, *numberedInstructions[i], i); diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index b773a6b..dff6503 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -1,4 +1,4 @@ -///===- FastISelEmitter.cpp - Generate an instruction selector -------------===// +///===- FastISelEmitter.cpp - Generate an instruction selector ------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -30,7 +30,6 @@ #include <utility> using namespace llvm; - /// InstructionMemo - This class holds additional information about an /// instruction needed to emit code for it. /// @@ -61,15 +60,15 @@ namespace { class ImmPredicateSet { DenseMap<TreePattern *, unsigned> ImmIDs; std::vector<TreePredicateFn> PredsByName; -public: +public: unsigned getIDFor(TreePredicateFn Pred) { unsigned &Entry = ImmIDs[Pred.getOrigPatFragRecord()]; if (Entry == 0) { PredsByName.push_back(Pred); Entry = PredsByName.size(); } - return Entry-1; + return Entry - 1; } const TreePredicateFn &getPredicate(unsigned i) { @@ -80,7 +79,6 @@ public: typedef std::vector<TreePredicateFn>::const_iterator iterator; iterator begin() const { return PredsByName.begin(); } iterator end() const { return PredsByName.end(); } - }; } // End anonymous namespace @@ -92,26 +90,39 @@ struct OperandsSignature { class OpKind { enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 }; char Repr; - public: + public: OpKind() : Repr(OK_Invalid) {} bool operator<(OpKind RHS) const { return Repr < RHS.Repr; } bool operator==(OpKind RHS) const { return Repr == RHS.Repr; } - static OpKind getReg() { OpKind K; K.Repr = OK_Reg; return K; } - static OpKind getFP() { OpKind K; K.Repr = OK_FP; return K; } + static OpKind getReg() { + OpKind K; + K.Repr = OK_Reg; + return K; + } + static OpKind getFP() { + OpKind K; + K.Repr = OK_FP; + return K; + } static OpKind getImm(unsigned V) { - assert((unsigned)OK_Imm+V < 128 && + assert((unsigned)OK_Imm + V < 128 && "Too many integer predicates for the 'Repr' char"); - OpKind K; K.Repr = OK_Imm+V; return K; + OpKind K; + K.Repr = OK_Imm + V; + return K; } bool isReg() const { return Repr == OK_Reg; } - bool isFP() const { return Repr == OK_FP; } + bool isFP() const { return Repr == OK_FP; } bool isImm() const { return Repr >= OK_Imm; } - unsigned getImmCode() const { assert(isImm()); return Repr-OK_Imm; } + unsigned getImmCode() const { + assert(isImm()); + return Repr - OK_Imm; + } void printManglingSuffix(raw_ostream &OS, ImmPredicateSet &ImmPredicates, bool StripImmCodes) const { @@ -123,12 +134,11 @@ struct OperandsSignature { OS << 'i'; if (!StripImmCodes) if (unsigned Code = getImmCode()) - OS << "_" << ImmPredicates.getPredicate(Code-1).getFnName(); + OS << "_" << ImmPredicates.getPredicate(Code - 1).getFnName(); } } }; - SmallVector<OpKind, 3> Operands; bool
operator<(const OperandsSignature &O) const { @@ -162,15 +172,17 @@ struct OperandsSignature { void emitImmediatePredicate(raw_ostream &OS, ImmPredicateSet &ImmPredicates) { bool EmittedAnything = false; for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - if (!Operands[i].isImm()) continue; + if (!Operands[i].isImm()) + continue; unsigned Code = Operands[i].getImmCode(); - if (Code == 0) continue; + if (Code == 0) + continue; if (EmittedAnything) OS << " &&\n "; - TreePredicateFn PredFn = ImmPredicates.getPredicate(Code-1); + TreePredicateFn PredFn = ImmPredicates.getPredicate(Code - 1); // Emit the type check. TreePattern *TP = PredFn.getOrigPatFragRecord(); @@ -179,7 +191,7 @@ struct OperandsSignature { "Cannot use variable value types with fast isel"); OS << "VT == " << getEnumName(VVT.getSimple().SimpleTy) << " && "; - OS << PredFn.getFnName() << "(imm" << i <<')'; + OS << PredFn.getFnName() << "(imm" << i << ')'; EmittedAnything = true; } } @@ -189,8 +201,7 @@ struct OperandsSignature { /// are supported, false otherwise. /// bool initialize(TreePatternNode *InstPatNode, const CodeGenTarget &Target, - MVT::SimpleValueType VT, - ImmPredicateSet &ImmediatePredicates, + MVT::SimpleValueType VT, ImmPredicateSet &ImmediatePredicates, const CodeGenRegisterClass *OrigDstRC) { if (InstPatNode->isLeaf()) return false; @@ -229,21 +240,20 @@ struct OperandsSignature { if (Rec->getValueAsBit("FastIselShouldIgnore")) return false; - PredNo = ImmediatePredicates.getIDFor(PredFn)+1; + PredNo = ImmediatePredicates.getIDFor(PredFn) + 1; } Operands.push_back(OpKind::getImm(PredNo)); continue; } - // For now, filter out any operand with a predicate. // For now, filter out any operand with multiple values. if (!Op->getPredicateCalls().empty() || Op->getNumTypes() != 1) return false; if (!Op->isLeaf()) { - if (Op->getOperator()->getName() == "fpimm") { + if (Op->getOperator()->getName() == "fpimm") { Operands.push_back(OpKind::getFP()); continue; } @@ -347,7 +357,6 @@ struct OperandsSignature { } } - void PrintManglingSuffix(raw_ostream &OS, const std::vector &PR, ImmPredicateSet &ImmPredicates, bool StripImmCodes = false) const { @@ -380,7 +389,7 @@ class FastISelMap { typedef std::map TypeRetPredMap; typedef std::map OpcodeTypeRetPredMap; typedef std::map - OperandsOpcodeTypeRetPredMap; + OperandsOpcodeTypeRetPredMap; OperandsOpcodeTypeRetPredMap SimplePatterns; @@ -389,22 +398,22 @@ class FastISelMap { MVT::SimpleValueType, std::string>> SimplePatternsCheck; - std::map > - SignaturesWithConstantForms; + std::map> + SignaturesWithConstantForms; StringRef InstNS; ImmPredicateSet ImmediatePredicates; + public: explicit FastISelMap(StringRef InstNS); void collectPatterns(CodeGenDAGPatterns &CGP); void printImmediatePredicates(raw_ostream &OS); void printFunctionDefinitions(raw_ostream &OS); + private: - void emitInstructionCode(raw_ostream &OS, - const OperandsSignature &Operands, - const PredMap &PM, - const std::string &RetVTName); + void emitInstructionCode(raw_ostream &OS, const OperandsSignature &Operands, + const PredMap &PM, const std::string &RetVTName); }; } // End anonymous namespace @@ -433,7 +442,7 @@ static std::string PhyRegForNode(TreePatternNode *Op, return PhysReg; PhysReg += cast(OpLeafRec->getValue("Namespace")->getValue()) - ->getValue(); + ->getValue(); PhysReg += "::"; PhysReg += Target.getRegBank().getReg(OpLeafRec)->getName(); return PhysReg; @@ -443,14 +452,15 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { const CodeGenTarget &Target = CGP.getTargetInfo(); 
// Scan through all the patterns and record the simple ones. - for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(), - E = CGP.ptm_end(); I != E; ++I) { + for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(), E = CGP.ptm_end(); + I != E; ++I) { const PatternToMatch &Pattern = *I; // For now, just look at Instructions, so that we don't have to worry // about emitting multiple instructions for a pattern. TreePatternNode *Dst = Pattern.getDstPattern(); - if (Dst->isLeaf()) continue; + if (Dst->isLeaf()) + continue; Record *Op = Dst->getOperator(); if (!Op->isSubClassOf("Instruction")) continue; @@ -495,7 +505,8 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { } else { // If this isn't a leaf, then continue since the register classes are // a bit too complicated for now. - if (!Dst->getChild(1)->isLeaf()) continue; + if (!Dst->getChild(1)->isLeaf()) + continue; DefInit *SR = dyn_cast<DefInit>(Dst->getChild(1)->getLeafValue()); if (SR) @@ -506,16 +517,20 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // Inspect the pattern. TreePatternNode *InstPatNode = Pattern.getSrcPattern(); - if (!InstPatNode) continue; - if (InstPatNode->isLeaf()) continue; + if (!InstPatNode) + continue; + if (InstPatNode->isLeaf()) + continue; // Ignore multiple result nodes for now. - if (InstPatNode->getNumTypes() > 1) continue; + if (InstPatNode->getNumTypes() > 1) + continue; Record *InstPatOp = InstPatNode->getOperator(); std::string OpcodeName = getOpcodeName(InstPatOp, CGP); MVT::SimpleValueType RetVT = MVT::isVoid; - if (InstPatNode->getNumTypes()) RetVT = InstPatNode->getSimpleType(0); + if (InstPatNode->getNumTypes()) + RetVT = InstPatNode->getSimpleType(0); MVT::SimpleValueType VT = RetVT; if (InstPatNode->getNumChildren()) { assert(InstPatNode->getChild(0)->getNumTypes() == 1); @@ -546,7 +561,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { if (PhysReg.empty()) { if (DstIndex >= Dst->getNumChildren() || Dst->getChild(DstIndex)->getName() != - InstPatNode->getChild(i)->getName()) { + InstPatNode->getChild(i)->getName()) { FoundNonSimplePattern = true; break; } @@ -568,21 +583,16 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { raw_string_ostream SuffixOS(ManglingSuffix); Operands.PrintManglingSuffix(SuffixOS, ImmediatePredicates, true); if (!StringSwitch<bool>(ManglingSuffix) - .Cases("", "r", "rr", "ri", "i", "f", true) - .Default(false)) + .Cases("", "r", "rr", "ri", "i", "f", true) + .Default(false)) continue; // Get the predicate that guards this pattern. std::string PredicateCheck = Pattern.getPredicateCheck(); // Ok, we found a pattern that we can handle. Remember it.
- InstructionMemo Memo( - Pattern.getDstPattern()->getOperator()->getName(), - DstRC, - SubRegNo, - PhysRegInputs, - PredicateCheck - ); + InstructionMemo Memo(Pattern.getDstPattern()->getOperator()->getName(), + DstRC, SubRegNo, PhysRegInputs, PredicateCheck); int complexity = Pattern.getPatternComplexity(CGP); @@ -590,7 +600,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { std::make_tuple(Operands, OpcodeName, VT, RetVT, PredicateCheck)); if (!inserted_simple_pattern.second) { PrintFatalError(Pattern.getSrcRecord()->getLoc(), - "Duplicate predicate in FastISel table!"); + "Duplicate predicate in FastISel table!"); } // Note: Instructions with the same complexity will appear in the order @@ -602,8 +612,8 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // them down to a signature that doesn't have predicates so that we can // associate them with the stripped predicate version. if (Operands.hasAnyImmediateCodes()) { - SignaturesWithConstantForms[Operands.getWithoutImmCodes()] - .push_back(Operands); + SignaturesWithConstantForms[Operands.getWithoutImmCodes()].push_back( + Operands); } } } @@ -645,7 +655,8 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, if (OneHadNoPredicate) { PrintFatalError("Multiple instructions match and one with no " "predicate came before one with a predicate! " - "name:" + Memo.Name + " predicate: " + PredicateCheck); + "name:" + + Memo.Name + " predicate: " + PredicateCheck); } OS << " if (" + PredicateCheck + ") {\n"; OS << " "; @@ -669,8 +680,8 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, Operands.PrintArguments(OS, Memo.PhysRegs); OS << ");\n"; } else { - OS << "extractsubreg(" << RetVTName - << ", Op0, " << Memo.SubRegNo << ");\n"; + OS << "extractsubreg(" << RetVTName << ", Op0, " << Memo.SubRegNo + << ");\n"; } if (!PredicateCheck.empty()) { @@ -685,7 +696,6 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, OS << "\n"; } - void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { // Now emit code for all the patterns that we collected. for (const auto &SimplePattern : SimplePatterns) { @@ -762,8 +772,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { } // Emit one function for the opcode that demultiplexes based on the type. - OS << "unsigned fastEmit_" - << getLegalCName(Opcode) << "_"; + OS << "unsigned fastEmit_" << getLegalCName(Opcode) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); OS << "(MVT VT, MVT RetVT"; if (!Operands.empty()) @@ -809,8 +818,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { // constrained forms of the immediate (e.g., 32-bit sext immediate in a // 64-bit operand), check them first. - std::map >::iterator MI - = SignaturesWithConstantForms.find(Operands); + std::map>::iterator MI = + SignaturesWithConstantForms.find(Operands); if (MI != SignaturesWithConstantForms.end()) { // Unique any duplicates out of the list. 
       llvm::sort(MI->second);
@@ -840,8 +849,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
 
     for (const auto &I : OTM) {
       const std::string &Opcode = I.first;
-      OS << "    case " << Opcode << ": return fastEmit_"
-         << getLegalCName(Opcode) << "_";
+      OS << "    case " << Opcode << ": return fastEmit_" << getLegalCName(Opcode)
+         << "_";
       Operands.PrintManglingSuffix(OS, ImmediatePredicates);
       OS << "(VT, RetVT";
       if (!Operands.empty())
@@ -862,7 +871,8 @@ static void EmitFastISel(RecordKeeper &RK, raw_ostream &OS) {
   CodeGenDAGPatterns CGP(RK);
   const CodeGenTarget &Target = CGP.getTargetInfo();
   emitSourceFileHeader("\"Fast\" Instruction Selector for the " +
-                       Target.getName().str() + " target", OS);
+                           Target.getName().str() + " target",
+                       OS);
 
   // Determine the target's namespace name.
   StringRef InstNS = Target.getInstNamespace();
diff --git a/llvm/utils/TableGen/InfoByHwMode.cpp b/llvm/utils/TableGen/InfoByHwMode.cpp
index 7e4ab53..6d9a35a 100644
--- a/llvm/utils/TableGen/InfoByHwMode.cpp
+++ b/llvm/utils/TableGen/InfoByHwMode.cpp
@@ -11,8 +11,8 @@
 // data).
 //===----------------------------------------------------------------------===//
 
-#include "CodeGenTarget.h"
 #include "InfoByHwMode.h"
+#include "CodeGenTarget.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Debug.h"
@@ -44,7 +44,7 @@ ValueTypeByHwMode::ValueTypeByHwMode(Record *R, MVT T) : ValueTypeByHwMode(T) {
     PtrAddrSpace = R->getValueAsInt("AddrSpace");
 }
 
-bool ValueTypeByHwMode::operator== (const ValueTypeByHwMode &T) const {
+bool ValueTypeByHwMode::operator==(const ValueTypeByHwMode &T) const {
   assert(isValid() && T.isValid() && "Invalid type in assignment");
   bool Simple = isSimple();
   if (Simple != T.isSimple())
@@ -55,7 +55,7 @@ bool ValueTypeByHwMode::operator== (const ValueTypeByHwMode &T) const {
   return Map == T.Map;
 }
 
-bool ValueTypeByHwMode::operator< (const ValueTypeByHwMode &T) const {
+bool ValueTypeByHwMode::operator<(const ValueTypeByHwMode &T) const {
   assert(isValid() && T.isValid() && "Invalid type in comparison");
   // Default order for maps.
   return Map < T.Map;
@@ -86,7 +86,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
     return;
   }
 
-  std::vector<const PairType*> Pairs;
+  std::vector<const PairType *> Pairs;
   for (const auto &P : Map)
     Pairs.push_back(&P);
   llvm::sort(Pairs, deref<std::less<PairType>>());
@@ -100,9 +100,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
 }
 
 LLVM_DUMP_METHOD
-void ValueTypeByHwMode::dump() const {
-  dbgs() << *this << '\n';
-}
+void ValueTypeByHwMode::dump() const { dbgs() << *this << '\n'; }
 
 ValueTypeByHwMode llvm::getValueTypeByHwMode(Record *Rec,
                                              const CodeGenHwModes &CGH) {
@@ -123,24 +121,22 @@ RegSizeInfo::RegSizeInfo(Record *R, const CodeGenHwModes &CGH) {
   SpillAlignment = R->getValueAsInt("SpillAlignment");
 }
 
-bool RegSizeInfo::operator< (const RegSizeInfo &I) const {
+bool RegSizeInfo::operator<(const RegSizeInfo &I) const {
   return std::tie(RegSize, SpillSize, SpillAlignment) <
          std::tie(I.RegSize, I.SpillSize, I.SpillAlignment);
 }
 
 bool RegSizeInfo::isSubClassOf(const RegSizeInfo &I) const {
-  return RegSize <= I.RegSize &&
-         SpillAlignment && I.SpillAlignment % SpillAlignment == 0 &&
-         SpillSize <= I.SpillSize;
+  return RegSize <= I.RegSize && SpillAlignment &&
+         I.SpillAlignment % SpillAlignment == 0 && SpillSize <= I.SpillSize;
 }
 
 void RegSizeInfo::writeToStream(raw_ostream &OS) const {
-  OS << "[R=" << RegSize << ",S=" << SpillSize
-     << ",A=" << SpillAlignment << ']';
+  OS << "[R=" << RegSize << ",S=" << SpillSize << ",A=" << SpillAlignment
+     << ']';
 }
 
-RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R,
-                                         const CodeGenHwModes &CGH) {
+RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R, const CodeGenHwModes &CGH) {
   const HwModeSelect &MS = CGH.getHwModeSelect(R);
   for (const HwModeSelect::PairType &P : MS.Items) {
     auto I = Map.insert({P.first, RegSizeInfo(P.second, CGH)});
@@ -149,12 +145,12 @@ RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R,
   }
 }
 
-bool RegSizeInfoByHwMode::operator< (const RegSizeInfoByHwMode &I) const {
+bool RegSizeInfoByHwMode::operator<(const RegSizeInfoByHwMode &I) const {
   unsigned M0 = Map.begin()->first;
   return get(M0) < I.get(M0);
 }
 
-bool RegSizeInfoByHwMode::operator== (const RegSizeInfoByHwMode &I) const {
+bool RegSizeInfoByHwMode::operator==(const RegSizeInfoByHwMode &I) const {
   unsigned M0 = Map.begin()->first;
   return get(M0) == I.get(M0);
 }
@@ -164,8 +160,8 @@ bool RegSizeInfoByHwMode::isSubClassOf(const RegSizeInfoByHwMode &I) const {
   return get(M0).isSubClassOf(I.get(M0));
 }
 
-bool RegSizeInfoByHwMode::hasStricterSpillThan(const RegSizeInfoByHwMode &I)
-      const {
+bool RegSizeInfoByHwMode::hasStricterSpillThan(
+    const RegSizeInfoByHwMode &I) const {
   unsigned M0 = Map.begin()->first;
   const RegSizeInfo &A0 = get(M0);
   const RegSizeInfo &B0 = I.get(M0);
@@ -175,7 +171,7 @@ void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
   typedef typename decltype(Map)::value_type PairType;
-  std::vector<const PairType*> Pairs;
+  std::vector<const PairType *> Pairs;
   for (const auto &P : Map)
     Pairs.push_back(&P);
   llvm::sort(Pairs, deref<std::less<PairType>>());
@@ -187,7 +183,8 @@ void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
   OS << '}';
 }
 
-EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH) {
+EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R,
+                                           const CodeGenHwModes &CGH) {
   const HwModeSelect &MS = CGH.getHwModeSelect(R);
   for (const HwModeSelect::PairType &P : MS.Items) {
     assert(P.second && P.second->isSubClassOf("InstructionEncoding") &&
@@ -199,18 +196,18 @@ EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH)
 }
 
 namespace llvm {
-  raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T) {
-    T.writeToStream(OS);
-    return OS;
-  }
+raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T) {
+  T.writeToStream(OS);
+  return OS;
+}
 
-  raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T) {
-    T.writeToStream(OS);
-    return OS;
-  }
+raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T) {
+  T.writeToStream(OS);
+  return OS;
+}
 
-  raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T) {
-    T.writeToStream(OS);
-    return OS;
-  }
+raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T) {
+  T.writeToStream(OS);
+  return OS;
 }
+} // namespace llvm
diff --git a/llvm/utils/TableGen/InfoByHwMode.h b/llvm/utils/TableGen/InfoByHwMode.h
index 4692ab2..5f53295 100644
--- a/llvm/utils/TableGen/InfoByHwMode.h
+++ b/llvm/utils/TableGen/InfoByHwMode.h
@@ -40,8 +40,7 @@ enum : unsigned {
 };
 
 template <typename InfoT>
-void union_modes(const InfoByHwMode<InfoT> &A,
-                 const InfoByHwMode<InfoT> &B,
+void union_modes(const InfoByHwMode<InfoT> &A, const InfoByHwMode<InfoT> &B,
                  SmallVectorImpl<unsigned> &Modes) {
   auto AI = A.begin();
   auto BI = B.begin();
@@ -85,9 +84,8 @@ void union_modes(const InfoByHwMode<InfoT> &A,
     Modes.push_back(DefaultMode);
 }
 
-template <typename InfoT>
-struct InfoByHwMode {
-  typedef std::map<unsigned,InfoT> MapType;
+template <typename InfoT> struct InfoByHwMode {
+  typedef std::map<unsigned, InfoT> MapType;
   typedef typename MapType::value_type PairType;
   typedef typename MapType::iterator iterator;
   typedef typename MapType::const_iterator const_iterator;
@@ -98,11 +96,11 @@ struct InfoByHwMode {
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   iterator begin() { return Map.begin(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
-  iterator end()   { return Map.end(); }
+  iterator end() { return Map.end(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   const_iterator begin() const { return Map.begin(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
-  const_iterator end() const   { return Map.end(); }
+  const_iterator end() const { return Map.end(); }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   bool empty() const { return Map.empty(); }
 
@@ -156,15 +154,13 @@ protected:
 struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
   ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH);
   ValueTypeByHwMode(Record *R, MVT T);
-  ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode,T}); }
+  ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode, T}); }
   ValueTypeByHwMode() = default;
 
-  bool operator== (const ValueTypeByHwMode &T) const;
-  bool operator< (const ValueTypeByHwMode &T) const;
+  bool operator==(const ValueTypeByHwMode &T) const;
+  bool operator<(const ValueTypeByHwMode &T) const;
 
-  bool isValid() const {
-    return !Map.empty();
-  }
+  bool isValid() const { return !Map.empty(); }
   MVT getType(unsigned Mode) const { return get(Mode); }
   MVT &getOrCreateTypeForMode(unsigned Mode, MVT Type);
 
@@ -178,8 +174,7 @@ struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
   }
 };
 
-ValueTypeByHwMode getValueTypeByHwMode(Record *Rec,
-                                       const CodeGenHwModes &CGH);
+ValueTypeByHwMode getValueTypeByHwMode(Record *Rec, const CodeGenHwModes &CGH);
 
 struct RegSizeInfo {
   unsigned RegSize;
@@ -188,14 +183,12 @@ struct RegSizeInfo {
   RegSizeInfo(Record *R, const CodeGenHwModes &CGH);
   RegSizeInfo() = default;
 
-  bool operator< (const RegSizeInfo &I) const;
-  bool operator== (const RegSizeInfo &I) const {
+  bool operator<(const RegSizeInfo &I) const;
+  bool operator==(const RegSizeInfo &I) const {
     return std::tie(RegSize, SpillSize, SpillAlignment) ==
            std::tie(I.RegSize, I.SpillSize, I.SpillAlignment);
   }
-  bool operator!= (const RegSizeInfo &I) const {
-    return !(*this == I);
-  }
+  bool operator!=(const RegSizeInfo &I) const { return !(*this == I); }
 
   bool isSubClassOf(const RegSizeInfo &I) const;
   void writeToStream(raw_ostream &OS) const;
@@ -204,9 +197,9 @@ struct RegSizeInfo {
 struct RegSizeInfoByHwMode : public InfoByHwMode<RegSizeInfo> {
   RegSizeInfoByHwMode(Record *R, const CodeGenHwModes &CGH);
   RegSizeInfoByHwMode() = default;
-  bool operator< (const RegSizeInfoByHwMode &VI) const;
-  bool operator== (const RegSizeInfoByHwMode &VI) const;
-  bool operator!= (const RegSizeInfoByHwMode &VI) const {
+  bool operator<(const RegSizeInfoByHwMode &VI) const;
+  bool operator==(const RegSizeInfoByHwMode &VI) const;
+  bool operator!=(const RegSizeInfoByHwMode &VI) const {
     return !(*this == VI);
   }
 
@@ -224,7 +217,7 @@ raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T);
 raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T);
 raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T);
 
-struct EncodingInfoByHwMode : public InfoByHwMode<Record*> {
+struct EncodingInfoByHwMode : public InfoByHwMode<Record *> {
   EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH);
   EncodingInfoByHwMode() = default;
 };
diff --git a/llvm/utils/TableGen/InstrDocsEmitter.cpp b/llvm/utils/TableGen/InstrDocsEmitter.cpp
index 616e7b5..efabf6b 100644
--- a/llvm/utils/TableGen/InstrDocsEmitter.cpp
+++ b/llvm/utils/TableGen/InstrDocsEmitter.cpp
@@ -44,11 +44,18 @@ static std::string escapeForRST(StringRef Str) {
   for (char C : Str) {
     switch (C) {
     // We want special characters to be shown as their C escape codes.
-    case '\n': Result += "\\n"; break;
-    case '\t': Result += "\\t"; break;
+    case '\n':
+      Result += "\\n";
+      break;
+    case '\t':
+      Result += "\\t";
+      break;
     // Underscore at the end of a line has a special meaning in rst.
-    case '_': Result += "\\_"; break;
-    default: Result += C;
+    case '_':
+      Result += "\\_";
+      break;
+    default:
+      Result += C;
     }
   }
   return Result;
@@ -96,7 +103,10 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
     std::vector<const char *> FlagStrings;
 #define xstr(s) str(s)
 #define str(s) #s
-#define FLAG(f) if (II->f) { FlagStrings.push_back(str(f)); }
+#define FLAG(f)                                                                \
+  if (II->f) {                                                                 \
+    FlagStrings.push_back(str(f));                                             \
+  }
     FLAG(isReturn)
     FLAG(isEHScopeReturn)
     FLAG(isBranch)
@@ -111,9 +121,9 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
     FLAG(isTrap)
     FLAG(canFoldAsLoad)
     FLAG(mayLoad)
-    //FLAG(mayLoad_Unset) // Deliberately omitted.
+    // FLAG(mayLoad_Unset) // Deliberately omitted.
     FLAG(mayStore)
-    //FLAG(mayStore_Unset) // Deliberately omitted.
+    // FLAG(mayStore_Unset) // Deliberately omitted.
     FLAG(isPredicable)
     FLAG(isConvertibleToThreeAddress)
     FLAG(isCommutable)
@@ -125,7 +135,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
     FLAG(hasCtrlDep)
     FLAG(isNotDuplicable)
    FLAG(hasSideEffects)
-    //FLAG(hasSideEffects_Unset) // Deliberately omitted.
+    // FLAG(hasSideEffects_Unset) // Deliberately omitted.
     FLAG(isAsCheapAsAMove)
     FLAG(hasExtraSrcRegAllocReq)
     FLAG(hasExtraDefRegAllocReq)
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index dbc5c22..2d08447 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -53,8 +53,8 @@ class InstrInfoEmitter {
   const CodeGenSchedModels &SchedModels;
 
 public:
-  InstrInfoEmitter(RecordKeeper &R):
-    Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {}
+  InstrInfoEmitter(RecordKeeper &R)
+      : Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {}
 
   // run - Output the instruction set description.
   void run(raw_ostream &OS);
@@ -69,8 +69,8 @@ private:
   /// The keys of this map are maps which have OpName enum values as their keys
   /// and instruction operand indices as their values. The values of this map
   /// are lists of instruction names.
-  typedef std::map<std::map<unsigned, unsigned>,
-                   std::vector<std::string>> OpNameMapTy;
+  typedef std::map<std::map<unsigned, unsigned>, std::vector<std::string>>
+      OpNameMapTy;
   typedef std::map<std::string, unsigned>::iterator StrUintMapIter;
 
   /// Generate member functions in the target-specific GenInstrInfo class.
@@ -94,13 +94,14 @@ private:
   void emitOperandTypeMappings(
       raw_ostream &OS, const CodeGenTarget &Target,
       ArrayRef<const CodeGenInstruction *> NumberedInstructions);
-  void initOperandMapData(
-      ArrayRef<const CodeGenInstruction*> NumberedInstructions,
-      StringRef Namespace,
-      std::map<std::string, unsigned> &Operands,
-      OpNameMapTy &OperandMap);
-  void emitOperandNameMappings(raw_ostream &OS, const CodeGenTarget &Target,
-      ArrayRef<const CodeGenInstruction*> NumberedInstructions);
+  void
+  initOperandMapData(ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+                     StringRef Namespace,
+                     std::map<std::string, unsigned> &Operands,
+                     OpNameMapTy &OperandMap);
+  void emitOperandNameMappings(
+      raw_ostream &OS, const CodeGenTarget &Target,
+      ArrayRef<const CodeGenInstruction *> NumberedInstructions);
 
   void emitLogicalOperandSizeMappings(
       raw_ostream &OS, StringRef Namespace,
@@ -193,8 +194,7 @@ InstrInfoEmitter::GetOperandInfo(const CodeGenInstruction &Inst) {
 
       // Fill in constraint info.
       Res += ", ";
-      const CGIOperandList::ConstraintInfo &Constraint =
-        Op.Constraints[j];
+      const CGIOperandList::ConstraintInfo &Constraint = Op.Constraints[j];
       if (Constraint.isNone())
         Res += "0";
       else if (Constraint.isEarlyClobber())
@@ -246,10 +246,9 @@ void InstrInfoEmitter::EmitOperandInfo(raw_ostream &OS,
 /// each instructions.  This is used to generate the OperandMap table as
 /// well as the getNamedOperandIdx() function.
 void InstrInfoEmitter::initOperandMapData(
-    ArrayRef<const CodeGenInstruction*> NumberedInstructions,
-    StringRef Namespace,
-    std::map<std::string, unsigned> &Operands,
-    OpNameMapTy &OperandMap) {
+    ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+    StringRef Namespace, std::map<std::string, unsigned> &Operands,
+    OpNameMapTy &OperandMap) {
   unsigned NumOperands = 0;
   for (const CodeGenInstruction *Inst : NumberedInstructions) {
     if (!Inst->TheDef->getValueAsBit("UseNamedOperandTable"))
@@ -259,13 +258,13 @@ void InstrInfoEmitter::initOperandMapData(
       StrUintMapIter I = Operands.find(Info.Name);
 
       if (I == Operands.end()) {
-        I = Operands.insert(Operands.begin(),
-                            std::pair<std::string, unsigned>(Info.Name, NumOperands++));
+        I = Operands.insert(Operands.begin(), std::pair<std::string, unsigned>(
+                                                  Info.Name, NumOperands++));
       }
       OpList[I->second] = Info.MIOperandNo;
     }
-    OperandMap[OpList].push_back(Namespace.str() + "::" +
-                                 Inst->TheDef->getName().str());
+    OperandMap[OpList].push_back(Namespace.str() +
+                                 "::" + Inst->TheDef->getName().str());
   }
 }
 
@@ -280,9 +279,9 @@ void InstrInfoEmitter::initOperandMapData(
 /// - A function called getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
 ///   for looking up the operand index for an instruction, given a value from
 ///   OpName enum
-void InstrInfoEmitter::emitOperandNameMappings(raw_ostream &OS,
-           const CodeGenTarget &Target,
-           ArrayRef<const CodeGenInstruction*> NumberedInstructions) {
+void InstrInfoEmitter::emitOperandNameMappings(
+    raw_ostream &OS, const CodeGenTarget &Target,
+    ArrayRef<const CodeGenInstruction *> NumberedInstructions) {
   StringRef Namespace = Target.getInstNamespace();
   std::string OpNameNS = "OpName";
   // Map of operand names to their enumeration value.  This will be used to
@@ -380,7 +379,8 @@ void InstrInfoEmitter::emitOperandTypeMappings(
     }
   }
 
-  OS << "  OPERAND_TYPE_LIST_END" << "\n};\n";
+  OS << "  OPERAND_TYPE_LIST_END"
+     << "\n};\n";
   OS << "} // end namespace OpTypes\n";
   OS << "} // end namespace " << Namespace << "\n";
   OS << "} // end namespace llvm\n";
@@ -685,7 +685,7 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
 
   for (const Record *Rec : TIIPredicates) {
     OS << "bool " << Rec->getValueAsString("FunctionName")
-        << "(const MCInst &MI);\n";
+       << "(const MCInst &MI);\n";
   }
 
   OS << "void verifyInstructionPredicates(unsigned Opcode, const FeatureBitset "
@@ -939,7 +939,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
 
   // Collect all of the instruction's implicit uses and defs.
   Records.startTimer("Collect uses/defs");
-  std::map<std::vector<Record*>, unsigned> EmittedLists;
+  std::map<std::vector<Record *>, unsigned> EmittedLists;
   std::vector<std::vector<Record *>> ImplicitLists;
   unsigned ImplicitListSize = 0;
   for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) {
@@ -1017,7 +1017,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
   InstrNames.emitStringLiteralDef(OS, Twine("extern const char ") + TargetName +
                                           "InstrNameData[]");
 
-  OS << "extern const unsigned " << TargetName <<"InstrNameIndices[] = {";
+  OS << "extern const unsigned " << TargetName << "InstrNameIndices[] = {";
   Num = 0;
   for (const CodeGenInstruction *Inst : NumberedInstructions) {
     // Newline every eight entries.
@@ -1104,7 +1104,6 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
         "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n"
      << "  ~" << ClassName << "() override = default;\n";
 
-
   OS << "\n};\n} // end namespace llvm\n";
 
   OS << "#endif // GET_INSTRINFO_HEADER\n\n";
@@ -1180,8 +1179,8 @@ void InstrInfoEmitter::emitRecord(
   int MinOperands = 0;
   if (!Inst.Operands.empty())
     // Each logical operand can be multiple MI operands.
-    MinOperands = Inst.Operands.back().MIOperandNo +
-                  Inst.Operands.back().MINumOperands;
+    MinOperands =
+        Inst.Operands.back().MIOperandNo + Inst.Operands.back().MINumOperands;
 
   OS << "  { ";
   OS << Num << ",\t" << MinOperands << ",\t" << Inst.Operands.NumDefs << ",\t"
@@ -1202,49 +1201,88 @@ void InstrInfoEmitter::emitRecord(
   OS << OperandInfoMap.find(OperandInfo)->second << ",\t0";
 
   // Emit all of the target independent flags...
-  if (Inst.isPreISelOpcode)    OS << "|(1ULL<<MCID::PreISelOpcode)";
+  if (Inst.isPreISelOpcode)
+    OS << "|(1ULL<<MCID::PreISelOpcode)";
   BitsInit *TSF = Inst.TheDef->getValueAsBitsInit("TSFlags");
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 28604c5..f7ae5ed 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -60,8 +60,8 @@ public:
                 raw_ostream &OS);
   void EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
   void EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
-  void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints, bool IsClang,
-                                 raw_ostream &OS);
+  void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints,
+                                 bool IsClang, raw_ostream &OS);
 };
 } // End anonymous namespace
 
@@ -204,7 +204,7 @@ void IntrinsicEmitter::EmitIITInfo(raw_ostream &OS) {
 }
 
 void IntrinsicEmitter::EmitTargetInfo(const CodeGenIntrinsicTable &Ints,
-                                    raw_ostream &OS) {
+                                      raw_ostream &OS) {
   OS << "// Target mapping\n";
   OS << "#ifdef GET_INTRINSIC_TARGET_DATA\n";
   OS << "struct IntrinsicTargetInfo {\n"
@@ -238,10 +238,10 @@ void IntrinsicEmitter::EmitIntrinsicToOverloadTable(
   OS << "  0";
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     // Add one to the index so we emit a null bit for the invalid #0 intrinsic.
-    if ((i+1)%8 == 0)
+    if ((i + 1) % 8 == 0)
       OS << ",\n  0";
     if (Ints[i].isOverloaded)
-      OS << " | (1<<" << (i+1)%8 << ')';
+      OS << " | (1<<" << (i + 1) % 8 << ')';
   }
   OS << "\n};\n\n";
   // OTable contains a true bit at the position if the intrinsic is overloaded.
@@ -271,7 +271,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
   // capture it in this vector, otherwise store a ~0U.
   std::vector<unsigned> FixedEncodings;
 
-  SequenceToOffsetTable<std::vector<unsigned char> > LongEncodingTable;
+  SequenceToOffsetTable<std::vector<unsigned char>> LongEncodingTable;
 
   std::vector<unsigned char> TypeSig;
 
@@ -292,7 +292,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
         Failed = true;
         break;
       }
-      Result = (Result << 4) | TypeSig[e-i-1];
+      Result = (Result << 4) | TypeSig[e - i - 1];
     }
 
     // If this could be encoded into a 31-bit word, return it.
@@ -330,7 +330,6 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
     TypeSig.clear();
     ComputeFixedEncoding(Ints[i], TypeSig);
 
-
     // Otherwise, emit the offset into the long encoding table.  We emit it this
     // way so that it is easier to read the offset in the .def file.
     OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", ";
@@ -344,7 +343,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
   LongEncodingTable.emit(OS, printIITEntry);
   OS << "  255\n};\n\n";
 
-  OS << "#endif\n\n";  // End of GET_INTRINSIC_GENERATOR_GLOBAL
+  OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL
 }
 
 namespace {
@@ -393,7 +392,8 @@ std::optional<bool> compareFnAttributes(const CodeGenIntrinsic *L,
   // Try to order by readonly/readnone attribute.
   uint32_t LK = L->ME.toIntValue();
   uint32_t RK = R->ME.toIntValue();
-  if (LK != RK) return (LK > RK);
+  if (LK != RK)
+    return (LK > RK);
 
   return std::nullopt;
 }
@@ -438,8 +438,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
     if (!UniqArgAttributes.try_emplace(Attrs, ID).second)
       continue;
 
-    assert(is_sorted(Attrs) &&
-           "Argument attributes are not sorted");
+    assert(is_sorted(Attrs) && "Argument attributes are not sorted");
 
     OS << "  case " << ID << ":\n";
     OS << "    return AttributeSet::get(C, {\n";
@@ -473,8 +472,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
         OS << "      Attribute::get(C, Attribute::ImmArg),\n";
         break;
       case CodeGenIntrinsic::Alignment:
-        OS << "      Attribute::get(C, Attribute::Alignment, "
-           << Attr.Value << "),\n";
+        OS << "      Attribute::get(C, Attribute::Alignment, " << Attr.Value
+           << "),\n";
         break;
      case CodeGenIntrinsic::Dereferenceable:
         OS << "      Attribute::get(C, Attribute::Dereferenceable, "
@@ -489,7 +488,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   OS << "}\n\n";
 
   // Compute unique function attribute sets.
-  std::map<const CodeGenIntrinsic*, unsigned, FnAttributeComparator>
+  std::map<const CodeGenIntrinsic *, unsigned, FnAttributeComparator>
       UniqFnAttributes;
   OS << "static AttributeSet getIntrinsicFnAttributeSet("
      << "LLVMContext &C, unsigned ID) {\n"
@@ -542,17 +541,18 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   OS << "AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) {\n";
 
   // Compute the maximum number of attribute arguments and the map
-  typedef std::map<const CodeGenIntrinsic*, unsigned, AttributeComparator> UniqAttrMapTy;
+  typedef std::map<const CodeGenIntrinsic *, unsigned, AttributeComparator>
+      UniqAttrMapTy;
   UniqAttrMapTy UniqAttributes;
   unsigned maxArgAttrs = 0;
   unsigned AttrNum = 0;
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     const CodeGenIntrinsic &intrinsic = Ints[i];
     maxArgAttrs =
-      std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
+        std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
     unsigned &N = UniqAttributes[&intrinsic];
-    if (N) continue;
+    if (N)
+      continue;
     N = ++AttrNum;
     assert(N < 65536 && "Too many unique attributes for table!");
   }
@@ -564,8 +564,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     const CodeGenIntrinsic &intrinsic = Ints[i];
 
-    OS << "    " << UniqAttributes[&intrinsic] << ", // "
-       << intrinsic.Name << "\n";
+    OS << "    " << UniqAttributes[&intrinsic] << ", // " << intrinsic.Name
+       << "\n";
   }
   OS << "  };\n\n";
diff --git a/llvm/utils/TableGen/OptParserEmitter.cpp b/llvm/utils/TableGen/OptParserEmitter.cpp
index 257cd44..0f08119 100644
--- a/llvm/utils/TableGen/OptParserEmitter.cpp
+++ b/llvm/utils/TableGen/OptParserEmitter.cpp
@@ -196,9 +196,9 @@ static MarshallingInfo createMarshallingInfo(const Record &R) {
 /// working with those options when given an input command line.
 static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
   // Get the option groups and options.
-  const std::vector<Record*> &Groups =
-      Records.getAllDerivedDefinitions("OptionGroup");
-  std::vector<Record*> Opts = Records.getAllDerivedDefinitions("Option");
+  const std::vector<Record *> &Groups =
+      Records.getAllDerivedDefinitions("OptionGroup");
+  std::vector<Record *> Opts = Records.getAllDerivedDefinitions("Option");
 
   emitSourceFileHeader("Option Parsing Definitions", OS);
 
@@ -423,8 +423,7 @@ static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
       write_cstring(OS, R.getValueAsString("Values"));
     else if (!isa<UnsetInit>(R.getValueInit("ValuesCode"))) {
       OS << getOptionName(R) << "_Values";
-    }
-    else
+    } else
       OS << "nullptr";
   };
 
diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp
index 0b9b6389f..d0a35ff 100644
--- a/llvm/utils/TableGen/PredicateExpander.cpp
+++ b/llvm/utils/TableGen/PredicateExpander.cpp
@@ -101,7 +101,6 @@ void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
   OS << Reg->getName();
 }
 
-
 void PredicateExpander::expandCheckRegOperandSimple(raw_ostream &OS,
                                                     int OpIndex,
                                                     StringRef FunctionMapper) {
@@ -487,7 +486,8 @@ void STIPredicateExpander::expandPrologue(raw_ostream &OS,
   OS << "unsigned ProcessorID = getSchedModel().getProcessorID();\n";
 }
 
-void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS, const OpcodeGroup &Group,
+void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS,
+                                             const OpcodeGroup &Group,
                                              bool ShouldUpdateOpcodeMask) {
   const OpcodeInfo &OI = Group.getOpcodeInfo();
   for (const PredicateInfo &PI : OI.getPredicates()) {
diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
index e07fb91..7f692f2 100644
--- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -27,19 +27,19 @@ class PseudoLoweringEmitter {
     enum MapKind { Operand, Imm, Reg };
     MapKind Kind;
     union {
-      unsigned Operand;   // Operand number mapped to.
-      uint64_t Imm;       // Integer immedate value.
-      Record *Reg;        // Physical register.
+      unsigned Operand; // Operand number mapped to.
+      uint64_t Imm;     // Integer immedate value.
+      Record *Reg;      // Physical register.
     } Data;
   };
   struct PseudoExpansion {
-    CodeGenInstruction Source;   // The source pseudo instruction definition.
-    CodeGenInstruction Dest;     // The destination instruction to lower to.
+    CodeGenInstruction Source; // The source pseudo instruction definition.
+    CodeGenInstruction Dest;   // The destination instruction to lower to.
     IndexedMap<OpData> OperandMap;
 
     PseudoExpansion(CodeGenInstruction &s, CodeGenInstruction &d,
-                    IndexedMap<OpData> &m) :
-      Source(s), Dest(d), OperandMap(m) {}
+                    IndexedMap<OpData> &m)
+        : Source(s), Dest(d), OperandMap(m) {}
   };
 
   RecordKeeper &Records;
@@ -57,6 +57,7 @@ class PseudoLoweringEmitter {
                                  unsigned BaseIdx);
   void evaluateExpansion(Record *Pseudo);
   void emitLoweringEmitter(raw_ostream &o);
+
 public:
   PseudoLoweringEmitter(RecordKeeper &R) : Records(R), Target(R) {}
 
@@ -69,9 +70,9 @@ public:
 
 // The pseudo expansion really should take a list of dags, not just
 // a single dag, so we can do fancier things.
-unsigned PseudoLoweringEmitter::
-addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
-                     IndexedMap<OpData> &OperandMap, unsigned BaseIdx) {
+unsigned PseudoLoweringEmitter::addDagOperandMapping(
+    Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
+    IndexedMap<OpData> &OperandMap, unsigned BaseIdx) {
   unsigned OpsAdded = 0;
   for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) {
     if (DefInit *DI = dyn_cast<DefInit>(Dag->getArg(i))) {
@@ -92,9 +93,9 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
       // FIXME: Are the message operand types backward?
       if (DI->getDef() != Insn.Operands[BaseIdx + i].Rec) {
         PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                        "', operand type '" + DI->getDef()->getName() +
-                        "' does not match expansion operand type '" +
-                        Insn.Operands[BaseIdx + i].Rec->getName() + "'");
+                            "', operand type '" + DI->getDef()->getName() +
+                            "' does not match expansion operand type '" +
+                            Insn.Operands[BaseIdx + i].Rec->getName() + "'");
         PrintFatalNote(DI->getDef(),
                        "Value was assigned at the following location:");
       }
@@ -118,7 +119,7 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
       // Just add the operands recursively. This is almost certainly
       // a constant value for a complex operand (> 1 MI operand).
       unsigned NewOps =
-        addDagOperandMapping(Rec, SubDag, Insn, OperandMap, BaseIdx + i);
+          addDagOperandMapping(Rec, SubDag, Insn, OperandMap, BaseIdx + i);
       OpsAdded += NewOps;
       // Since we added more than one, we also need to adjust the base.
       BaseIdx += NewOps - 1;
@@ -140,15 +141,15 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
   DefInit *OpDef = dyn_cast<DefInit>(Dag->getOperator());
   if (!OpDef) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator is not a record");
+                        "', result operator is not a record");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
   Record *Operator = OpDef->getDef();
   if (!Operator->isSubClassOf("Instruction")) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator '" + Operator->getName() +
-                    "' is not an instruction");
+                        "', result operator '" + Operator->getName() +
+                        "' is not an instruction");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
@@ -157,16 +158,16 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
 
   if (Insn.isCodeGenOnly || Insn.isPseudo) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator '" + Operator->getName() +
-                    "' cannot be a pseudo instruction");
+                        "', result operator '" + Operator->getName() +
+                        "' cannot be a pseudo instruction");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
 
   if (Insn.Operands.size() != Dag->getNumArgs()) {
     PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                    "', result operator '" + Operator->getName() +
-                    "' has the wrong number of operands");
+                        "', result operator '" + Operator->getName() +
+                        "' has the wrong number of operands");
     PrintFatalNote(Rec->getValue("ResultInst"),
                    "Result was assigned at the following location:");
   }
@@ -201,11 +202,11 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
     if (OperandMap[Insn.Operands[i].MIOperandNo].Kind != OpData::Operand)
       continue;
     StringMap<unsigned>::iterator SourceOp =
-      SourceOperands.find(Dag->getArgNameStr(i));
+        SourceOperands.find(Dag->getArgNameStr(i));
     if (SourceOp == SourceOperands.end()) {
       PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
-                      "', output operand '" + Dag->getArgNameStr(i) +
-                      "' has no matching source operand");
+                          "', output operand '" + Dag->getArgNameStr(i) +
+                          "' has no matching source operand");
       PrintFatalNote(Rec->getValue("ResultInst"),
                      "Value was assigned at the following location:");
     }
@@ -213,7 +214,7 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
     // MachineInstr operand.
     for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I)
       OperandMap[Insn.Operands[i].MIOperandNo + I].Data.Operand =
-        SourceOp->getValue();
+          SourceOp->getValue();
 
     LLVM_DEBUG(dbgs() << "    " << SourceOp->getValue() << " ==> " << i
                       << "\n");
@@ -226,7 +227,8 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
   // Emit file header.
   emitSourceFileHeader("Pseudo-instruction MC lowering Source Fragment", o);
 
-  o << "bool " << Target.getName() + "AsmPrinter" << "::\n"
+  o << "bool " << Target.getName() + "AsmPrinter"
+    << "::\n"
     << "emitPseudoExpansionLowering(MCStreamer &OutStreamer,\n"
    << "                            const MachineInstr *MI) {\n";
 
@@ -236,12 +238,12 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
   for (auto &Expansion : Expansions) {
     CodeGenInstruction &Source = Expansion.Source;
     CodeGenInstruction &Dest = Expansion.Dest;
-    o << "  case " << Source.Namespace << "::"
-      << Source.TheDef->getName() << ": {\n"
+    o << "  case " << Source.Namespace << "::" << Source.TheDef->getName()
+      << ": {\n"
      << "    MCInst TmpInst;\n"
      << "    MCOperand MCOp;\n"
-      << "    TmpInst.setOpcode(" << Dest.Namespace << "::"
-      << Dest.TheDef->getName() << ");\n";
+      << "    TmpInst.setOpcode(" << Dest.Namespace
+      << "::" << Dest.TheDef->getName() << ");\n";
 
     // Copy the operands from the source instruction.
     // FIXME: Instruction operands with defaults values (predicates and cc_out
@@ -252,29 +254,29 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
       o << "    // Operand: " << DestOperand.Name << "\n";
       for (unsigned i = 0, e = DestOperand.MINumOperands; i != e; ++i) {
         switch (Expansion.OperandMap[MIOpNo + i].Kind) {
-          case OpData::Operand:
+        case OpData::Operand:
           o << "    lowerOperand(MI->getOperand("
-            << Source.Operands[Expansion.OperandMap[MIOpNo].Data
-                .Operand].MIOperandNo + i
+            << Source.Operands[Expansion.OperandMap[MIOpNo].Data.Operand]
+                       .MIOperandNo +
+                   i
            << "), MCOp);\n"
            << "    TmpInst.addOperand(MCOp);\n";
          break;
-          case OpData::Imm:
+        case OpData::Imm:
           o << "    TmpInst.addOperand(MCOperand::createImm("
             << Expansion.OperandMap[MIOpNo + i].Data.Imm << "));\n";
           break;
-          case OpData::Reg: {
-            Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
-            o << "    TmpInst.addOperand(MCOperand::createReg(";
-            // "zero_reg" is special.
-            if (Reg->getName() == "zero_reg")
-              o << "0";
-            else
-              o << Reg->getValueAsString("Namespace") << "::"
-                << Reg->getName();
-            o << "));\n";
-            break;
-          }
+        case OpData::Reg: {
+          Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
+          o << "    TmpInst.addOperand(MCOperand::createReg(";
+          // "zero_reg" is special.
+          if (Reg->getName() == "zero_reg")
+            o << "0";
+          else
+            o << Reg->getValueAsString("Namespace") << "::" << Reg->getName();
+          o << "));\n";
+          break;
+        }
         }
       }
       MIOpNo += DestOperand.MINumOperands;
diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index f851d9a..8b59411 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -46,7 +46,9 @@ public:
   /// Get the human-readable name for the bank.
   StringRef getName() const { return TheDef.getValueAsString("Name"); }
 
   /// Get the name of the enumerator in the ID enumeration.
-  std::string getEnumeratorName() const { return (TheDef.getName() + "ID").str(); }
+  std::string getEnumeratorName() const {
+    return (TheDef.getName() + "ID").str();
+  }
 
   /// Get the name of the array holding the register class coverage data;
   std::string getCoverageArrayName() const {
@@ -212,8 +214,7 @@
 }
 
 void RegisterBankEmitter::emitBaseClassImplementation(
-    raw_ostream &OS, StringRef TargetName,
-    std::vector<RegisterBank> &Banks) {
+    raw_ostream &OS, StringRef TargetName, std::vector<RegisterBank> &Banks) {
   const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank();
   const CodeGenHwModes &CGH = Target.getHwModes();
 
@@ -229,7 +230,8 @@ void RegisterBankEmitter::emitBaseClassImplementation(
       OS << "const uint32_t " << Bank.getCoverageArrayName() << "[] = {\n";
       unsigned LowestIdxInWord = 0;
       for (const auto &RCs : RCsGroupedByWord) {
-        OS << "    // " << LowestIdxInWord << "-" << (LowestIdxInWord + 31) << "\n";
+        OS << "    // " << LowestIdxInWord << "-" << (LowestIdxInWord + 31)
+           << "\n";
         for (const auto &RC : RCs) {
           OS << "    (1u << (" << RC->getQualifiedIdName() << " - "
              << LowestIdxInWord << ")) |\n";
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index cff9777..8919e07 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -101,8 +101,8 @@ private:
 } // end anonymous namespace
 
 // runEnums - Print out enum values for all of the registers.
-void RegisterInfoEmitter::runEnums(raw_ostream &OS,
-                                   CodeGenTarget &Target, CodeGenRegBank &Bank) {
+void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target,
+                                   CodeGenRegBank &Bank) {
   const auto &Registers = Bank.getRegisters();
 
   // Register enums are stored as uint16_t in the tables. Make sure we'll fit.
@@ -129,7 +129,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     OS << "  " << Reg.getName() << " = " << Reg.EnumValue << ",\n";
   assert(Registers.size() == Registers.back().EnumValue &&
         "Register enum value mismatch!");
-  OS << "  NUM_TARGET_REGS // " << Registers.size()+1 << "\n";
+  OS << "  NUM_TARGET_REGS // " << Registers.size() + 1 << "\n";
   OS << "};\n";
   if (!Namespace.empty())
     OS << "} // end namespace " << Namespace << "\n";
@@ -152,7 +152,8 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     OS << "} // end namespace " << Namespace << "\n\n";
   }
 
-  const std::vector<Record*> &RegAltNameIndices = Target.getRegAltNameIndices();
+  const std::vector<Record *> &RegAltNameIndices =
+      Target.getRegAltNameIndices();
 
   // If the only definition is the default NoRegAltName, we don't need to
   // emit anything.
   if (RegAltNameIndices.size() > 1) {
@@ -188,7 +189,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     OS << "namespace " << Namespace << " {\n";
     OS << "enum RegisterPressureSets {\n";
     unsigned NumSets = Bank.getNumRegPressureSets();
-    for (unsigned i = 0; i < NumSets; ++i ) {
+    for (unsigned i = 0; i < NumSets; ++i) {
       const RegUnitSet &RegUnits = Bank.getRegSetAt(i);
       OS << "  " << RegUnits.Name << " = " << i << ",\n";
     }
@@ -201,13 +202,11 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
   OS << "#endif // GET_REGINFO_ENUM\n\n";
 }
 
-static void printInt(raw_ostream &OS, int Val) {
-  OS << Val;
-}
+static void printInt(raw_ostream &OS, int Val) { OS << Val; }
 
-void RegisterInfoEmitter::
-EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
-                    const std::string &ClassName) {
+void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
+                                              const CodeGenRegBank &RegBank,
+                                              const std::string &ClassName) {
   unsigned NumRCs = RegBank.getRegClasses().size();
   unsigned NumSets = RegBank.getNumRegPressureSets();
 
@@ -254,8 +253,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
     }
     OS << "};\n"
        << "  return RUWeightTable[RegUnit];\n";
-  }
-  else {
+  } else {
     OS << "  // All register units have unit weight.\n"
       << "  return 1;\n";
   }
@@ -271,7 +269,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
      << "getRegPressureSetName(unsigned Idx) const {\n"
      << "  static const char *PressureNameTable[] = {\n";
   unsigned MaxRegUnitWeight = 0;
-  for (unsigned i = 0; i < NumSets; ++i ) {
+  for (unsigned i = 0; i < NumSets; ++i) {
     const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
     MaxRegUnitWeight = std::max(MaxRegUnitWeight, RegUnits.Weight);
     OS << "    \"" << RegUnits.Name << "\",\n";
@@ -287,10 +285,10 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
         "{\n"
     << "  static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
     << " PressureLimitTable[] = {\n";
-  for (unsigned i = 0; i < NumSets; ++i ) {
+  for (unsigned i = 0; i < NumSets; ++i) {
     const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
-    OS << "    " << RegUnits.Weight << ",  \t// " << i << ": "
-       << RegUnits.Name << "\n";
+    OS << "    " << RegUnits.Weight << ",  \t// " << i << ": " << RegUnits.Name
+       << "\n";
   }
   OS << "  };\n"
      << "  return PressureLimitTable[Idx];\n"
@@ -353,7 +351,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
     << "}\n\n";
 }
 
-using DwarfRegNumsMapPair = std::pair<Record*, std::vector<int64_t>>;
+using DwarfRegNumsMapPair = std::pair<Record *, std::vector<int64_t>>;
 using DwarfRegNumsVecTy = std::vector<DwarfRegNumsMapPair>;
 
 static void finalizeDwarfRegNumsKeys(DwarfRegNumsVecTy &DwarfRegNums) {
@@ -419,7 +417,7 @@ void RegisterInfoEmitter::EmitRegMappingTables(
 
       // Store the mapping sorted by the LLVM reg num so lookup can be done
       // with a binary search.
-      std::map<uint64_t, Record*> Dwarf2LMap;
+      std::map<uint64_t, Record *> Dwarf2LMap;
       for (auto &DwarfRegNum : DwarfRegNums) {
         int DwarfRegNo = DwarfRegNum.second[I];
         if (DwarfRegNo < 0)
@@ -531,8 +529,8 @@ void RegisterInfoEmitter::EmitRegMapping(
     else
       OS << "EHFlavour";
     OS << ") {\n"
-     << "  default:\n"
-     << "    llvm_unreachable(\"Unknown DWARF flavour\");\n";
+       << "  default:\n"
+       << "    llvm_unreachable(\"Unknown DWARF flavour\");\n";
 
     for (unsigned i = 0, e = maxLength; i != e; ++i) {
       OS << "  case " << i << ":\n";
@@ -540,14 +538,14 @@ void RegisterInfoEmitter::EmitRegMapping(
       if (!isCtor)
         OS << "RI->";
       std::string Tmp;
-      raw_string_ostream(Tmp) << Namespace
-                              << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
-                              << "Dwarf2L";
+      raw_string_ostream(Tmp)
+          << Namespace << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
"DwarfFlavour" : "EHFlavour") << i + << "Dwarf2L"; OS << "mapDwarfRegsToLLVMRegs(" << Tmp << ", " << Tmp << "Size, "; if (j == 0) - OS << "false"; - else - OS << "true"; + OS << "false"; + else + OS << "true"; OS << ");\n"; OS << " break;\n"; } @@ -571,14 +569,14 @@ void RegisterInfoEmitter::EmitRegMapping( if (!isCtor) OS << "RI->"; std::string Tmp; - raw_string_ostream(Tmp) << Namespace - << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i - << "L2Dwarf"; + raw_string_ostream(Tmp) + << Namespace << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i + << "L2Dwarf"; OS << "mapLLVMRegsToDwarfRegs(" << Tmp << ", " << Tmp << "Size, "; if (j == 0) - OS << "false"; - else - OS << "true"; + OS << "false"; + else + OS << "true"; OS << ");\n"; OS << " break;\n"; } @@ -588,8 +586,7 @@ void RegisterInfoEmitter::EmitRegMapping( // Print a BitVector as a sequence of hex numbers using a little-endian mapping. // Width is the number of bits per hex number. -static void printBitVectorAsHex(raw_ostream &OS, - const BitVector &Bits, +static void printBitVectorAsHex(raw_ostream &OS, const BitVector &Bits, unsigned Width) { assert(Width <= 32 && "Width too large"); unsigned Digits = (Width + 3) / 4; @@ -604,16 +601,15 @@ static void printBitVectorAsHex(raw_ostream &OS, // Helper to emit a set of bits into a constant byte array. class BitVectorEmitter { BitVector Values; + public: void add(unsigned v) { if (v >= Values.size()) - Values.resize(((v/8)+1)*8); // Round up to the next byte. + Values.resize(((v / 8) + 1) * 8); // Round up to the next byte. Values[v] = true; } - void print(raw_ostream &OS) { - printBitVectorAsHex(OS, Values, 8); - } + void print(raw_ostream &OS) { printBitVectorAsHex(OS, Values, 8); } }; static void printSimpleValueType(raw_ostream &OS, MVT::SimpleValueType VT) { @@ -650,9 +646,8 @@ static DiffVec &diffEncode(DiffVec &V, SparseBitVector<> List) { return V; } -template -static -DiffVec &diffEncode(DiffVec &V, unsigned InitVal, Iter Begin, Iter End) { +template +static DiffVec &diffEncode(DiffVec &V, unsigned InitVal, Iter Begin, Iter End) { assert(V.empty() && "Clear DiffVec before diffEncode."); unsigned Val = InitVal; for (Iter I = Begin; I != End; ++I) { @@ -672,7 +667,7 @@ static void printMask(raw_ostream &OS, LaneBitmask Val) { // Try to combine Idx's compose map into Vec if it is compatible. // Return false if it's not possible. static bool combine(const CodeGenSubRegIndex *Idx, - SmallVectorImpl &Vec) { + SmallVectorImpl &Vec) { const CodeGenSubRegIndex::CompMap &Map = Idx->getComposites(); for (const auto &I : Map) { CodeGenSubRegIndex *&Entry = Vec[I.first->EnumValue - 1]; @@ -683,17 +678,15 @@ static bool combine(const CodeGenSubRegIndex *Idx, // All entries are compatible. Make it so. 
   for (const auto &I : Map) {
     auto *&Entry = Vec[I.first->EnumValue - 1];
-    assert((!Entry || Entry == I.second) &&
-           "Expected EnumValue to be unique");
+    assert((!Entry || Entry == I.second) && "Expected EnumValue to be unique");
     Entry = I.second;
   }
   return true;
 }
 
-void
-RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
-                                              CodeGenRegBank &RegBank,
-                                              const std::string &ClName) {
+void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
+                                                   CodeGenRegBank &RegBank,
+                                                   const std::string &ClName) {
   const auto &SubRegIndices = RegBank.getSubRegIndices();
   OS << "unsigned " << ClName
      << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n";
@@ -707,7 +700,7 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
 
   // Map each Sub-register index to a compatible table row.
   SmallVector<unsigned, 4> RowMap;
-  SmallVector<SmallVector<CodeGenSubRegIndex*, 4>, 4> Rows;
+  SmallVector<SmallVector<CodeGenSubRegIndex *, 4>, 4> Rows;
 
   auto SubRegIndicesSize =
       std::distance(SubRegIndices.begin(), SubRegIndices.end());
@@ -760,10 +753,8 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
   OS << "}\n\n";
 }
 
-void
-RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
-                                                    CodeGenRegBank &RegBank,
-                                                    const std::string &ClName) {
+void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(
+    raw_ostream &OS, CodeGenRegBank &RegBank, const std::string &ClName) {
   // See the comments in computeSubRegLaneMasks() for our goal here.
   const auto &SubRegIndices = RegBank.getSubRegIndices();
 
@@ -771,8 +762,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   SmallVector<unsigned, 4> SubReg2SequenceIndexMap;
   SmallVector<SmallVector<MaskRolPair, 1>, 4> Sequences;
   for (const auto &Idx : SubRegIndices) {
-    const SmallVector<MaskRolPair, 1> &IdxSequence
-      = Idx.CompositionLaneMaskTransform;
+    const SmallVector<MaskRolPair, 1> &IdxSequence =
+        Idx.CompositionLaneMaskTransform;
 
     unsigned Found = ~0u;
     unsigned SIdx = 0;
@@ -807,7 +798,7 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
       OS << format(", %2u }, ", P.RotateLeft);
     }
     OS << "{ LaneBitmask::getNone(), 0 }";
-    if (s+1 != se)
+    if (s + 1 != se)
       OS << ", ";
     OS << " // Sequence " << Idx << "\n";
     Idx += Sequence.size() + 1;
@@ -820,7 +811,7 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   for (size_t i = 0, e = SubRegIndices.size(); i != e; ++i) {
     OS << "    ";
     OS << SubReg2SequenceIndexMap[i];
-    if (i+1 != e)
+    if (i + 1 != e)
       OS << ",";
     OS << " // to " << SubRegIndices[i].getName() << "\n";
   }
@@ -829,15 +820,18 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   OS << "LaneBitmask " << ClName
     << "::composeSubRegIndexLaneMaskImpl(unsigned IdxA, LaneBitmask LaneMask)"
        " const {\n"
-       "  --IdxA; assert(IdxA < " << SubRegIndices.size()
+       "  --IdxA; assert(IdxA < "
+     << SubRegIndices.size()
      << " && \"Subregister index out of bounds\");\n"
        "  LaneBitmask Result;\n"
        "  for (const MaskRolOp *Ops =\n"
       "       &LaneMaskComposeSequences[CompositeSequences[IdxA]];\n"
       "       Ops->Mask.any(); ++Ops) {\n"
-       "    LaneBitmask::Type M = LaneMask.getAsInteger() & Ops->Mask.getAsInteger();\n"
+       "    LaneBitmask::Type M = LaneMask.getAsInteger() & "
+       "Ops->Mask.getAsInteger();\n"
       "    if (unsigned S = Ops->RotateLeft)\n"
-       "      Result |= LaneBitmask((M << S) | (M >> (LaneBitmask::BitWidth - S)));\n"
+       "      Result |= LaneBitmask((M << S) | (M >> (LaneBitmask::BitWidth - "
+       "S)));\n"
       "    else\n"
       "      Result |= LaneBitmask(M);\n"
       "  }\n"
@@ -848,7 +842,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
     << "::reverseComposeSubRegIndexLaneMaskImpl(unsigned IdxA, "
       " LaneBitmask LaneMask) const {\n"
       "  LaneMask &= getSubRegIndexLaneMask(IdxA);\n"
-       "  --IdxA; assert(IdxA < " << SubRegIndices.size()
+       "  --IdxA; assert(IdxA < "
+     << SubRegIndices.size()
     << " && \"Subregister index out of bounds\");\n"
       "  LaneBitmask Result;\n"
       "  for (const MaskRolOp *Ops =\n"
@@ -856,7 +851,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
       "       Ops->Mask.any(); ++Ops) {\n"
       "    LaneBitmask::Type M = LaneMask.getAsInteger();\n"
       "    if (unsigned S = Ops->RotateLeft)\n"
-       "      Result |= LaneBitmask((M >> S) | (M << (LaneBitmask::BitWidth - S)));\n"
+       "      Result |= LaneBitmask((M >> S) | (M << (LaneBitmask::BitWidth - "
+       "S)));\n"
       "    else\n"
       "      Result |= LaneBitmask(M);\n"
       "  }\n"
@@ -867,9 +863,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
 
 //
 // runMCDesc - Print out MC register descriptions.
 //
-void
-RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
-                               CodeGenRegBank &RegBank) {
+void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
+                                    CodeGenRegBank &RegBank) {
   emitSourceFileHeader("MC Register Information", OS);
 
   OS << "\n#ifdef GET_REGINFO_MC_DESC\n";
@@ -880,7 +875,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   auto &SubRegIndices = RegBank.getSubRegIndices();
   // The lists of sub-registers and super-registers go in the same array.  That
   // allows us to share suffixes.
-  typedef std::vector<const CodeGenRegister*> RegVec;
+  typedef std::vector<const CodeGenRegister *> RegVec;
 
   // Differentially encoded lists.
   SequenceToOffsetTable<DiffVec> DiffSeqs;
@@ -894,7 +889,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   // Keep track of sub-register names as well. These are not differentially
   // encoded.
-  typedef SmallVector<const CodeGenSubRegIndex*, 4> SubRegIdxVec;
+  typedef SmallVector<const CodeGenSubRegIndex *, 4> SubRegIdxVec;
   SequenceToOffsetTable<SubRegIdxVec, deref<std::less<>>> SubRegIdxSeqs;
   SmallVector<SubRegIdxVec, 4> SubRegIdxLists(Regs.size());
 
@@ -907,7 +902,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
     RegStrings.add(std::string(Reg.getName()));
 
     // Compute the ordered sub-register list.
-    SetVector<const CodeGenRegister*> SR;
+    SetVector<const CodeGenRegister *> SR;
     Reg.addSubRegsPreOrder(SR, RegBank);
     diffEncode(SubRegLists[i], Reg.EnumValue, SR.begin(), SR.end());
     DiffSeqs.add(SubRegLists[i]);
@@ -961,8 +956,8 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "};\n\n";
 
   // Emit the table of sub-register index sizes.
-  OS << "extern const MCRegisterInfo::SubRegCoveredBits "
-     << TargetName << "SubRegIdxRanges[] = {\n";
+  OS << "extern const MCRegisterInfo::SubRegCoveredBits " << TargetName
+     << "SubRegIdxRanges[] = {\n";
   OS << "  { " << (uint16_t)-1 << ", " << (uint16_t)-1 << " },\n";
   for (const auto &Idx : SubRegIndices) {
     OS << "  { " << Idx.Offset << ", " << Idx.Size << " },\t// "
@@ -995,13 +990,13 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
        << LaneMaskSeqs.get(RegUnitLaneMasks[i]) << " },\n";
     ++i;
   }
-  OS << "};\n\n";      // End of register descriptors...
+  OS << "};\n\n"; // End of register descriptors...
 
   // Emit the table of register unit roots. Each regunit has one or two root
   // registers.
OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2] = {\n"; for (unsigned i = 0, e = RegBank.getNumNativeRegUnits(); i != e; ++i) { - ArrayRef Roots = RegBank.getRegUnit(i).getRoots(); + ArrayRef Roots = RegBank.getRegUnit(i).getRoots(); assert(!Roots.empty() && "All regunits must have a root register."); assert(Roots.size() <= 2 && "More than two roots not supported yet."); OS << " { "; @@ -1021,7 +1016,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, // Emit the register enum value arrays for each RegisterClass for (const auto &RC : RegisterClasses) { - ArrayRef Order = RC.getOrder(); + ArrayRef Order = RC.getOrder(); // Give the register class a legal C name if it's anonymous. const std::string &Name = RC.getName(); @@ -1092,7 +1087,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, } OS << " " << Value << ",\n"; } - OS << "};\n"; // End of HW encoding table + OS << "};\n"; // End of HW encoding table // MCRegisterInfo initialization routine. OS << "static inline void Init" << TargetName @@ -1117,9 +1112,9 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, OS << "#endif // GET_REGINFO_MC_DESC\n\n"; } -void -RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target, - CodeGenRegBank &RegBank) { +void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, + CodeGenTarget &Target, + CodeGenRegBank &RegBank) { emitSourceFileHeader("Register Information Header Fragment", OS); OS << "\n#ifdef GET_REGINFO_HEADER\n"; @@ -1175,8 +1170,10 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target, << " const MachineFunction &MF);\n"; const auto &RegisterClasses = RegBank.getRegClasses(); - if (llvm::any_of(RegisterClasses, [](const auto &RC) { return RC.getBaseClassOrder(); })) { - OS << " const TargetRegisterClass *getPhysRegBaseClass(MCRegister Reg) const override;\n"; + if (llvm::any_of(RegisterClasses, + [](const auto &RC) { return RC.getBaseClassOrder(); })) { + OS << " const TargetRegisterClass *getPhysRegBaseClass(MCRegister Reg) " + "const override;\n"; } OS << "};\n\n"; @@ -1200,9 +1197,8 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target, // // runTargetDesc - Output the target register and register file descriptions. // -void -RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, - CodeGenRegBank &RegBank){ +void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, + CodeGenRegBank &RegBank) { emitSourceFileHeader("Target Register and Register Classes Information", OS); OS << "\n#ifdef GET_REGINFO_TARGET_DESC\n"; @@ -1219,11 +1215,11 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, const auto &SubRegIndices = RegBank.getSubRegIndices(); // Collect all registers belonging to any allocatable class. - std::set AllocatableRegs; + std::set AllocatableRegs; // Collect allocatable registers. for (const auto &RC : RegisterClasses) { - ArrayRef Order = RC.getOrder(); + ArrayRef Order = RC.getOrder(); if (RC.Allocatable) AllocatableRegs.insert(Order.begin(), Order.end()); @@ -1297,7 +1293,6 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, } OS << "};\n"; - OS << "\nstatic const TargetRegisterClass *const " << "NullRegClasses[] = { nullptr };\n\n"; @@ -1320,7 +1315,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, // Every bit mask present in the list has at least one bit set. // Compress the sub-reg index lists. 
-  typedef std::vector<const CodeGenSubRegIndex*> IdxList;
+  typedef std::vector<const CodeGenSubRegIndex *> IdxList;
   SmallVector<IdxList, 8> SuperRegIdxLists(RegisterClasses.size());
   SequenceToOffsetTable<IdxList, deref<std::less<>>> SuperRegIdxSeqs;
   BitVector MaskBV(RegisterClasses.size());
 
@@ -1354,14 +1349,14 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   // Emit NULL terminated super-class lists.
   for (const auto &RC : RegisterClasses) {
-    ArrayRef<CodeGenRegisterClass*> Supers = RC.getSuperClasses();
+    ArrayRef<CodeGenRegisterClass *> Supers = RC.getSuperClasses();
 
     // Skip classes without supers.  We can reuse NullRegClasses.
     if (Supers.empty())
       continue;
 
-    OS << "static const TargetRegisterClass *const "
-       << RC.getName() << "Superclasses[] = {\n";
+    OS << "static const TargetRegisterClass *const " << RC.getName()
+       << "Superclasses[] = {\n";
     for (const auto *Super : Supers)
       OS << "  &" << Super->getQualifiedName() << "RegClass,\n";
     OS << "  nullptr\n};\n\n";
@@ -1371,12 +1366,12 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   for (const auto &RC : RegisterClasses) {
     if (!RC.AltOrderSelect.empty()) {
       OS << "\nstatic inline unsigned " << RC.getName()
-         << "AltOrderSelect(const MachineFunction &MF) {"
-         << RC.AltOrderSelect << "}\n\n"
+         << "AltOrderSelect(const MachineFunction &MF) {" << RC.AltOrderSelect
+         << "}\n\n"
         << "static ArrayRef<MCPhysReg> " << RC.getName()
        << "GetRawAllocationOrder(const MachineFunction &MF) {\n";
-      for (unsigned oi = 1 , oe = RC.getNumOrders(); oi != oe; ++oi) {
-        ArrayRef<Record*> Elems = RC.getOrder(oi);
+      for (unsigned oi = 1, oe = RC.getNumOrders(); oi != oe; ++oi) {
+        ArrayRef<Record *> Elems = RC.getOrder(oi);
         if (!Elems.empty()) {
           OS << "  static const MCPhysReg AltOrder" << oi << "[] = {";
           for (unsigned elem = 0; elem != Elems.size(); ++elem)
@@ -1556,8 +1551,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
         EnumValue = SubRegClass->EnumValue + 1;
       }
 
-      OS << "    " << EnumValue << ",\t// "
-         << RC.getName() << ':' << Idx.getName();
+      OS << "    " << EnumValue << ",\t// " << RC.getName() << ':'
+         << Idx.getName();
 
       if (MatchingSubClass) {
         CodeGenRegisterClass *SubRegClass = MatchingSubClass->second;
@@ -1581,7 +1576,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   // Emit register base class mapper
   if (!RegisterClasses.empty()) {
     // Collect base classes
-    SmallVector<const CodeGenRegisterClass*> BaseClasses;
+    SmallVector<const CodeGenRegisterClass *> BaseClasses;
     for (const auto &RC : RegisterClasses) {
      if (RC.getBaseClassOrder())
        BaseClasses.push_back(&RC);
@@ -1592,9 +1587,10 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
     // Apply order
     struct BaseClassOrdering {
-      bool operator()(const CodeGenRegisterClass *LHS, const CodeGenRegisterClass *RHS) const {
-        return std::pair(*LHS->getBaseClassOrder(), LHS->EnumValue)
-             < std::pair(*RHS->getBaseClassOrder(), RHS->EnumValue);
+      bool operator()(const CodeGenRegisterClass *LHS,
+                      const CodeGenRegisterClass *RHS) const {
+        return std::pair(*LHS->getBaseClassOrder(), LHS->EnumValue) <
+               std::pair(*RHS->getBaseClassOrder(), RHS->EnumValue);
       }
     };
     llvm::stable_sort(BaseClasses, BaseClassOrdering());
@@ -1638,8 +1634,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "extern const char " << TargetName << "RegClassStrings[];\n";
   OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2];\n";
   OS << "extern const uint16_t " << TargetName << "SubRegIdxLists[];\n";
-  OS << "extern const MCRegisterInfo::SubRegCoveredBits "
-     << TargetName << "SubRegIdxRanges[];\n";
+  OS << "extern const MCRegisterInfo::SubRegCoveredBits " << TargetName
"SubRegIdxRanges[];\n"; OS << "extern const uint16_t " << TargetName << "RegEncodingTable[];\n"; EmitRegMappingTables(OS, Regs, true); @@ -1673,16 +1669,15 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, OS << "}\n\n"; // Emit CalleeSavedRegs information. - std::vector CSRSets = - Records.getAllDerivedDefinitions("CalleeSavedRegs"); + std::vector CSRSets = + Records.getAllDerivedDefinitions("CalleeSavedRegs"); for (unsigned i = 0, e = CSRSets.size(); i != e; ++i) { Record *CSRSet = CSRSets[i]; const SetTheory::RecVec *Regs = RegBank.getSets().expand(CSRSet); assert(Regs && "Cannot expand CalleeSavedRegs instance"); // Emit the *_SaveList list of callee-saved registers. - OS << "static const MCPhysReg " << CSRSet->getName() - << "_SaveList[] = { "; + OS << "static const MCPhysReg " << CSRSet->getName() << "_SaveList[] = { "; for (unsigned r = 0, re = Regs->size(); r != re; ++r) OS << getQualifiedName((*Regs)[r]) << ", "; OS << "0 };\n"; @@ -1693,11 +1688,11 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, // Check for an optional OtherPreserved set. // Add those registers to RegMask, but not to SaveList. if (DagInit *OPDag = - dyn_cast(CSRSet->getValueInit("OtherPreserved"))) { + dyn_cast(CSRSet->getValueInit("OtherPreserved"))) { SetTheory::RecSet OPSet; RegBank.getSets().evaluate(OPDag, OPSet, CSRSet->getLoc()); Covered |= RegBank.computeCoveredRegisters( - ArrayRef(OPSet.begin(), OPSet.end())); + ArrayRef(OPSet.begin(), OPSet.end())); } // Add all constant physical registers to the preserved mask: @@ -1709,8 +1704,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, Covered |= RegBank.computeCoveredRegisters( ArrayRef(ConstantSet.begin(), ConstantSet.end())); - OS << "static const uint32_t " << CSRSet->getName() - << "_RegMask[] = { "; + OS << "static const uint32_t " << CSRSet->getName() << "_RegMask[] = { "; printBitVectorAsHex(OS, Covered, 32); OS << "};\n"; } @@ -1795,7 +1789,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, } OS << "}\n\n"; - OS << "const " << TargetName << "FrameLowering *\n" << TargetName + OS << "const " << TargetName << "FrameLowering *\n" + << TargetName << "GenRegisterInfo::getFrameLowering(const MachineFunction &MF) {\n" << " return static_cast(\n" << " MF.getSubtarget().getFrameLowering());\n" @@ -1827,7 +1822,7 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) { CodeGenRegBank &RegBank = Target.getRegBank(); const CodeGenHwModes &CGH = Target.getHwModes(); unsigned NumModes = CGH.getNumModeIds(); - auto getModeName = [CGH] (unsigned M) -> StringRef { + auto getModeName = [CGH](unsigned M) -> StringRef { if (M == 0) return "Default"; return CGH.getMode(M).Name; @@ -1883,9 +1878,10 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) { OS << '\n'; OS << "\tCoveredBySubregs: " << R.CoveredBySubRegs << '\n'; OS << "\tHasDisjunctSubRegs: " << R.HasDisjunctSubRegs << '\n'; - for (std::pair P : R.getSubRegs()) { - OS << "\tSubReg " << P.first->getName() - << " = " << P.second->getName() << '\n'; + for (std::pair P : + R.getSubRegs()) { + OS << "\tSubReg " << P.first->getName() << " = " << P.second->getName() + << '\n'; } } } diff --git a/llvm/utils/TableGen/SDNodeProperties.h b/llvm/utils/TableGen/SDNodeProperties.h index 66a04e6..5715423 100644 --- a/llvm/utils/TableGen/SDNodeProperties.h +++ b/llvm/utils/TableGen/SDNodeProperties.h @@ -34,6 +34,6 @@ enum SDNP { unsigned parseSDPatternOperatorProperties(Record *R); -} +} // namespace llvm 
#endif diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index d75a9e9..0cce798a 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -134,7 +134,7 @@ private: Twine("Entry for field '") + Field.Name + "' is null"); return std::string(Entry->first); } - PrintFatalError(Loc, Twine("invalid field type for field '") + Field.Name + + PrintFatalError(Loc, Twine("invalid field type for field '") + Field.Name + "'; expected: bit, bits, string, or code"); } @@ -173,7 +173,7 @@ private: return "uint32_t"; if (NumBits <= 64) return "uint64_t"; - PrintFatalError(Index.Loc, Twine("In table '") + Table.Name + + PrintFatalError(Index.Loc, Twine("In table '") + Table.Name + "' lookup method '" + Index.Name + "', key field '" + Field.Name + "' of type bits is too large"); @@ -425,7 +425,7 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, OS << " struct KeyType {\n"; for (const auto &Field : Index.Fields) { - OS << " " << searchableFieldType(Table, Index, Field, TypeInTempStruct) + OS << " " << searchableFieldType(Table, Index, Field, TypeInTempStruct) << " " << Field.Name << ";\n"; } OS << " };\n"; @@ -436,7 +436,7 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, if (isa(Field.RecType)) { OS << ".upper()"; if (IsPrimary) - PrintFatalError(Index.Loc, + PrintFatalError(Index.Loc, Twine("In table '") + Table.Name + "', use a secondary lookup method for " "case-insensitive comparison of field '" + @@ -580,7 +580,7 @@ std::unique_ptr SearchableTableEmitter::parseSearchIndex( Twine("In table '") + Table.Name + "', 'PrimaryKey' or 'Key' refers to nonexistent field '" + FieldName + "'"); - + Index->Fields.push_back(*Field); } @@ -643,11 +643,11 @@ void SearchableTableEmitter::collectTableEntries( } else { RecTy *Ty = resolveTypes(Field.RecType, TI->getType()); if (!Ty) - PrintFatalError(EntryRec->getValue(Field.Name), + PrintFatalError(EntryRec->getValue(Field.Name), Twine("Field '") + Field.Name + "' of table '" + - Table.Name + "' entry has incompatible type: " + - TI->getType()->getAsString() + " vs. " + - Field.RecType->getAsString()); + Table.Name + "' entry has incompatible type: " + + TI->getType()->getAsString() + " vs. " + + Field.RecType->getAsString()); Field.RecType = Ty; } } @@ -702,7 +702,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) { StringRef FilterClass = EnumRec->getValueAsString("FilterClass"); Enum->Class = Records.getClass(FilterClass); if (!Enum->Class) - PrintFatalError(EnumRec->getValue("FilterClass"), + PrintFatalError(EnumRec->getValue("FilterClass"), Twine("Enum FilterClass '") + FilterClass + "' does not exist"); @@ -723,11 +723,13 @@ void SearchableTableEmitter::run(raw_ostream &OS) { for (const auto &FieldName : Fields) { Table->Fields.emplace_back(FieldName); // Construct a GenericField. 
- if (auto TypeOfRecordVal = TableRec->getValue(("TypeOf_" + FieldName).str())) { - if (!parseFieldType(Table->Fields.back(), TypeOfRecordVal->getValue())) { - PrintError(TypeOfRecordVal, - Twine("Table '") + Table->Name + - "' has invalid 'TypeOf_" + FieldName + + if (auto TypeOfRecordVal = + TableRec->getValue(("TypeOf_" + FieldName).str())) { + if (!parseFieldType(Table->Fields.back(), + TypeOfRecordVal->getValue())) { + PrintError(TypeOfRecordVal, + Twine("Table '") + Table->Name + "' has invalid 'TypeOf_" + + FieldName + "': " + TypeOfRecordVal->getValue()->getAsString()); PrintFatalNote("The 'TypeOf_xxx' field must be a string naming a " "GenericEnum record, or \"code\""); @@ -737,9 +739,9 @@ void SearchableTableEmitter::run(raw_ostream &OS) { StringRef FilterClass = TableRec->getValueAsString("FilterClass"); if (!Records.getClass(FilterClass)) - PrintFatalError(TableRec->getValue("FilterClass"), - Twine("Table FilterClass '") + - FilterClass + "' does not exist"); + PrintFatalError(TableRec->getValue("FilterClass"), + Twine("Table FilterClass '") + FilterClass + + "' does not exist"); RecordVal *FilterClassFieldVal = TableRec->getValue("FilterClassField"); std::vector Definitions = @@ -779,14 +781,14 @@ void SearchableTableEmitter::run(raw_ostream &OS) { Record *TableRec = IndexRec->getValueAsDef("Table"); auto It = TableMap.find(TableRec); if (It == TableMap.end()) - PrintFatalError(IndexRec->getValue("Table"), + PrintFatalError(IndexRec->getValue("Table"), Twine("SearchIndex '") + IndexRec->getName() + "' refers to nonexistent table '" + TableRec->getName()); GenericTable &Table = *It->second; Table.Indices.push_back( - parseSearchIndex(Table, IndexRec->getValue("Key"), IndexRec->getName(), + parseSearchIndex(Table, IndexRec->getValue("Key"), IndexRec->getName(), IndexRec->getValueAsListOfStrings("Key"), IndexRec->getValueAsBit("EarlyOut"))); } diff --git a/llvm/utils/TableGen/SequenceToOffsetTable.h b/llvm/utils/TableGen/SequenceToOffsetTable.h index 77a404d..7db39a9 100644 --- a/llvm/utils/TableGen/SequenceToOffsetTable.h +++ b/llvm/utils/TableGen/SequenceToOffsetTable.h @@ -44,7 +44,7 @@ static inline void printChar(raw_ostream &OS, char C) { /// /// @tparam SeqT The sequence container. (vector or string). /// @tparam Less A stable comparator for SeqT elements. -template > +template > class SequenceToOffsetTable { typedef typename SeqT::value_type ElemT; @@ -53,8 +53,8 @@ class SequenceToOffsetTable { struct SeqLess { Less L; bool operator()(const SeqT &A, const SeqT &B) const { - return std::lexicographical_compare(A.rbegin(), A.rend(), - B.rbegin(), B.rend(), L); + return std::lexicographical_compare(A.rbegin(), A.rend(), B.rbegin(), + B.rend(), L); } }; @@ -153,15 +153,15 @@ public: /// emit - Print out the table as the body of an array initializer. /// Use the Print function to print elements. 
- void emit(raw_ostream &OS, - void (*Print)(raw_ostream&, ElemT), + void emit(raw_ostream &OS, void (*Print)(raw_ostream &, ElemT), const char *Term = "0") const { assert((empty() || Entries) && "Call layout() before emit()"); for (typename SeqMap::const_iterator I = Seqs.begin(), E = Seqs.end(); I != E; ++I) { OS << " /* " << I->second << " */ "; for (typename SeqT::const_iterator SI = I->first.begin(), - SE = I->first.end(); SI != SE; ++SI) { + SE = I->first.end(); + SI != SE; ++SI) { Print(OS, *SI); OS << ", "; } diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 3922518..b1502ea 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -51,9 +51,9 @@ struct LessRecordFieldFieldName { }; class SubtargetEmitter { - // Each processor has a SchedClassDesc table with an entry for each SchedClass. - // The SchedClassDesc table indexes into a global write resource table, write - // latency table, and read advance table. + // Each processor has a SchedClassDesc table with an entry for each + // SchedClass. The SchedClassDesc table indexes into a global write resource + // table, write latency table, and read advance table. struct SchedClassTables { std::vector> ProcSchedClasses; std::vector WriteProcResources; @@ -89,20 +89,18 @@ class SubtargetEmitter { const DenseMap &FeatureMap); unsigned CPUKeyValues(raw_ostream &OS, const DenseMap &FeatureMap); - void FormItineraryStageString(const std::string &Names, - Record *ItinData, std::string &ItinString, - unsigned &NStages); - void FormItineraryOperandCycleString(Record *ItinData, std::string &ItinString, + void FormItineraryStageString(const std::string &Names, Record *ItinData, + std::string &ItinString, unsigned &NStages); + void FormItineraryOperandCycleString(Record *ItinData, + std::string &ItinString, unsigned &NOperandCycles); - void FormItineraryBypassString(const std::string &Names, - Record *ItinData, - std::string &ItinString, unsigned NOperandCycles); - void EmitStageAndOperandCycleData(raw_ostream &OS, - std::vector> - &ProcItinLists); + void FormItineraryBypassString(const std::string &Names, Record *ItinData, + std::string &ItinString, + unsigned NOperandCycles); + void EmitStageAndOperandCycleData( + raw_ostream &OS, std::vector> &ProcItinLists); void EmitItineraries(raw_ostream &OS, - std::vector> - &ProcItinLists); + std::vector> &ProcItinLists); unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel, raw_ostream &OS); void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel, @@ -153,15 +151,16 @@ public: void SubtargetEmitter::Enumeration(raw_ostream &OS, DenseMap &FeatureMap) { // Get all records of class and sort - std::vector DefList = - Records.getAllDerivedDefinitions("SubtargetFeature"); + std::vector DefList = + Records.getAllDerivedDefinitions("SubtargetFeature"); llvm::sort(DefList, LessRecord()); unsigned N = DefList.size(); if (N == 0) return; if (N + 1 > MAX_SUBTARGET_FEATURES) - PrintFatalError("Too many subtarget features! Bump MAX_SUBTARGET_FEATURES."); + PrintFatalError( + "Too many subtarget features! 
Bump MAX_SUBTARGET_FEATURES."); OS << "namespace " << Target << " {\n"; @@ -248,8 +247,8 @@ void SubtargetEmitter::EmitSubtargetInfoMacroCalls(raw_ostream &OS) { unsigned SubtargetEmitter::FeatureKeyValues( raw_ostream &OS, const DenseMap &FeatureMap) { // Gather and sort all the features - std::vector FeatureList = - Records.getAllDerivedDefinitions("SubtargetFeature"); + std::vector FeatureList = + Records.getAllDerivedDefinitions("SubtargetFeature"); if (FeatureList.empty()) return 0; @@ -269,13 +268,14 @@ unsigned SubtargetEmitter::FeatureKeyValues( StringRef CommandLineName = Feature->getValueAsString("Name"); StringRef Desc = Feature->getValueAsString("Desc"); - if (CommandLineName.empty()) continue; + if (CommandLineName.empty()) + continue; - // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in } } + // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in + // } } OS << " { " << "\"" << CommandLineName << "\", " - << "\"" << Desc << "\", " - << Target << "::" << Name << ", "; + << "\"" << Desc << "\", " << Target << "::" << Name << ", "; RecVec ImpliesList = Feature->getValueAsListOfDefs("Implies"); @@ -299,8 +299,8 @@ unsigned SubtargetEmitter::CPUKeyValues(raw_ostream &OS, const DenseMap &FeatureMap) { // Gather and sort processor information - std::vector ProcessorList = - Records.getAllDerivedDefinitions("Processor"); + std::vector ProcessorList = + Records.getAllDerivedDefinitions("Processor"); llvm::sort(ProcessorList, LessRecordFieldName()); // Begin processor table @@ -324,7 +324,7 @@ SubtargetEmitter::CPUKeyValues(raw_ostream &OS, // Emit the scheduler model pointer. const std::string &ProcModelName = - SchedModels.getModelForProc(Processor).ModelName; + SchedModels.getModelForProc(Processor).ModelName; OS << ", &" << ProcModelName << " },\n"; } @@ -363,7 +363,8 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name, for (unsigned j = 0, M = UnitList.size(); j < M;) { // Add name and bitwise or ItinString += Name + "FU::" + UnitList[j]->getName().str(); - if (++j < M) ItinString += " | "; + if (++j < M) + ItinString += " | "; } int TimeInc = Stage->getValueAsInt("TimeInc"); @@ -374,7 +375,8 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name, // Close off stage ItinString += " }"; - if (++i < N) ItinString += ", "; + if (++i < N) + ItinString += ", "; } } @@ -383,11 +385,11 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name, // operand cycle initialization for the specified itinerary. N is the // number of operands that has cycles specified. // -void SubtargetEmitter::FormItineraryOperandCycleString(Record *ItinData, - std::string &ItinString, unsigned &NOperandCycles) { +void SubtargetEmitter::FormItineraryOperandCycleString( + Record *ItinData, std::string &ItinString, unsigned &NOperandCycles) { // Get operand cycle list std::vector OperandCycleList = - ItinData->getValueAsListOfInts("OperandCycles"); + ItinData->getValueAsListOfInts("OperandCycles"); // For each operand cycle NOperandCycles = OperandCycleList.size(); @@ -422,12 +424,10 @@ void SubtargetEmitter::FormItineraryBypassString(const std::string &Name, // cycle tables. Create a list of InstrItinerary objects (ProcItinLists) indexed // by CodeGenSchedClass::Index. 
// -void SubtargetEmitter:: -EmitStageAndOperandCycleData(raw_ostream &OS, - std::vector> - &ProcItinLists) { +void SubtargetEmitter::EmitStageAndOperandCycleData( + raw_ostream &OS, std::vector> &ProcItinLists) { // Multiple processor models may share an itinerary record. Emit it once. - SmallPtrSet ItinsDefSet; + SmallPtrSet ItinsDefSet; // Emit functional units for all the itineraries. for (const CodeGenProcModel &ProcModel : SchedModels.procModels()) { @@ -452,30 +452,31 @@ EmitStageAndOperandCycleData(raw_ostream &OS, RecVec BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP"); if (!BPs.empty()) { OS << "\n// Pipeline forwarding paths for itineraries \"" << Name - << "\"\n" << "namespace " << Name << "Bypass {\n"; + << "\"\n" + << "namespace " << Name << "Bypass {\n"; OS << " const unsigned NoBypass = 0;\n"; for (unsigned j = 0, BPN = BPs.size(); j < BPN; ++j) - OS << " const unsigned " << BPs[j]->getName() - << " = 1 << " << j << ";\n"; + OS << " const unsigned " << BPs[j]->getName() << " = 1 << " << j + << ";\n"; OS << "} // end namespace " << Name << "Bypass\n"; } } // Begin stages table - std::string StageTable = "\nextern const llvm::InstrStage " + Target + - "Stages[] = {\n"; + std::string StageTable = + "\nextern const llvm::InstrStage " + Target + "Stages[] = {\n"; StageTable += " { 0, 0, 0, llvm::InstrStage::Required }, // No itinerary\n"; // Begin operand cycle table - std::string OperandCycleTable = "extern const unsigned " + Target + - "OperandCycles[] = {\n"; + std::string OperandCycleTable = + "extern const unsigned " + Target + "OperandCycles[] = {\n"; OperandCycleTable += " 0, // No itinerary\n"; // Begin pipeline bypass table - std::string BypassTable = "extern const unsigned " + Target + - "ForwardingPaths[] = {\n"; + std::string BypassTable = + "extern const unsigned " + Target + "ForwardingPaths[] = {\n"; BypassTable += " 0, // No itinerary\n"; // For each Itinerary across all processors, add a unique entry to the stages, @@ -485,7 +486,7 @@ EmitStageAndOperandCycleData(raw_ostream &OS, std::map ItinStageMap, ItinOperandMap; for (const CodeGenProcModel &ProcModel : SchedModels.procModels()) { // Add process itinerary to the list. - ProcItinLists.resize(ProcItinLists.size()+1); + ProcItinLists.resize(ProcItinLists.size() + 1); // If this processor defines no itineraries, then leave the itinerary list // empty. @@ -542,19 +543,20 @@ EmitStageAndOperandCycleData(raw_ostream &OS, // Check to see if operand cycle already exists and create if it doesn't uint16_t FindOperandCycle = 0; if (NOperandCycles > 0) { - std::string ItinOperandString = ItinOperandCycleString+ItinBypassString; + std::string ItinOperandString = + ItinOperandCycleString + ItinBypassString; FindOperandCycle = ItinOperandMap[ItinOperandString]; if (FindOperandCycle == 0) { // Emit as cycle, // index OperandCycleTable += ItinOperandCycleString + ", // "; std::string OperandIdxComment = itostr(OperandCycleCount); if (NOperandCycles > 1) - OperandIdxComment += "-" - + itostr(OperandCycleCount + NOperandCycles - 1); + OperandIdxComment += + "-" + itostr(OperandCycleCount + NOperandCycles - 1); OperandCycleTable += OperandIdxComment + "\n"; // Record Itin class number. 
- ItinOperandMap[ItinOperandCycleString] = - FindOperandCycle = OperandCycleCount; + ItinOperandMap[ItinOperandCycleString] = FindOperandCycle = + OperandCycleCount; // Emit as bypass, // index BypassTable += ItinBypassString + ", // " + OperandIdxComment + "\n"; OperandCycleCount += NOperandCycles; @@ -599,17 +601,17 @@ EmitStageAndOperandCycleData(raw_ostream &OS, // Itineraries for each processor. The Itinerary lists are indexed on // CodeGenSchedClass::Index. // -void SubtargetEmitter:: -EmitItineraries(raw_ostream &OS, - std::vector> &ProcItinLists) { +void SubtargetEmitter::EmitItineraries( + raw_ostream &OS, std::vector> &ProcItinLists) { // Multiple processor models may share an itinerary record. Emit it once. - SmallPtrSet ItinsDefSet; + SmallPtrSet ItinsDefSet; // For each processor's machine model - std::vector>::iterator - ProcItinListsIter = ProcItinLists.begin(); + std::vector>::iterator ProcItinListsIter = + ProcItinLists.begin(); for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(), - PE = SchedModels.procModelEnd(); PI != PE; ++PI, ++ProcItinListsIter) { + PE = SchedModels.procModelEnd(); + PI != PE; ++PI, ++ProcItinListsIter) { Record *ItinsDef = PI->ItinsDef; if (!ItinsDefSet.insert(ItinsDef).second) @@ -636,13 +638,10 @@ EmitItineraries(raw_ostream &OS, // Emit Itinerary in the form of // { firstStage, lastStage, firstCycle, lastCycle } // index - OS << " { " << - Intinerary.NumMicroOps << ", " << - Intinerary.FirstStage << ", " << - Intinerary.LastStage << ", " << - Intinerary.FirstOperandCycle << ", " << - Intinerary.LastOperandCycle << " }" << - ", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n"; + OS << " { " << Intinerary.NumMicroOps << ", " << Intinerary.FirstStage + << ", " << Intinerary.LastStage << ", " << Intinerary.FirstOperandCycle + << ", " << Intinerary.LastOperandCycle << " }" + << ", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n"; } // End processor itinerary table OS << " { 0, uint16_t(~0U), uint16_t(~0U), uint16_t(~0U), uint16_t(~0U) }" @@ -840,13 +839,11 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, NumUnits += RU->getValueAsInt("NumUnits"); SubUnitsOffset += RU->getValueAsInt("NumUnits"); } - } - else { + } else { // Find the SuperIdx if (PRDef->getValueInit("Super")->isComplete()) { - SuperDef = - SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"), - ProcModel, PRDef->getLoc()); + SuperDef = SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"), + ProcModel, PRDef->getLoc()); SuperIdx = ProcModel.getProcResourceIdx(SuperDef); } NumUnits = PRDef->getValueAsInt("NumUnits"); @@ -862,7 +859,7 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, } else { OS << "nullptr"; } - OS << "}, // #" << i+1; + OS << "}, // #" << i + 1; if (SuperDef) OS << ", Super=" << SuperDef->getName(); OS << "\n"; @@ -872,8 +869,9 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, // Find the WriteRes Record that defines processor resources for this // SchedWrite. -Record *SubtargetEmitter::FindWriteResources( - const CodeGenSchedRW &SchedWrite, const CodeGenProcModel &ProcModel) { +Record * +SubtargetEmitter::FindWriteResources(const CodeGenSchedRW &SchedWrite, + const CodeGenProcModel &ProcModel) { // Check if the SchedWrite is already subtarget-specific and directly // specifies a set of processor resources. 
@@ -883,16 +881,18 @@ Record *SubtargetEmitter::FindWriteResources( Record *AliasDef = nullptr; for (Record *A : SchedWrite.Aliases) { const CodeGenSchedRW &AliasRW = - SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); + SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) { Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel"); if (&SchedModels.getProcModel(ModelDef) != &ProcModel) continue; } if (AliasDef) - PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases " - "defined for processor " + ProcModel.ModelName + - " Ensure only one SchedAlias exists per RW."); + PrintFatalError(AliasRW.TheDef->getLoc(), + "Multiple aliases " + "defined for processor " + + ProcModel.ModelName + + " Ensure only one SchedAlias exists per RW."); AliasDef = AliasRW.TheDef; } if (AliasDef && AliasDef->isSubClassOf("SchedWriteRes")) @@ -903,12 +903,12 @@ Record *SubtargetEmitter::FindWriteResources( for (Record *WR : ProcModel.WriteResDefs) { if (!WR->isSubClassOf("WriteRes")) continue; - if (AliasDef == WR->getValueAsDef("WriteType") - || SchedWrite.TheDef == WR->getValueAsDef("WriteType")) { + if (AliasDef == WR->getValueAsDef("WriteType") || + SchedWrite.TheDef == WR->getValueAsDef("WriteType")) { if (ResDef) { PrintFatalError(WR->getLoc(), "Resources are defined for both " - "SchedWrite and its alias on processor " + - ProcModel.ModelName); + "SchedWrite and its alias on processor " + + ProcModel.ModelName); } ResDef = WR; } @@ -918,7 +918,7 @@ Record *SubtargetEmitter::FindWriteResources( if (!ResDef) { PrintFatalError(ProcModel.ModelDef->getLoc(), Twine("Processor does not define resources for ") + - SchedWrite.TheDef->getName()); + SchedWrite.TheDef->getName()); } return ResDef; } @@ -935,16 +935,18 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead, Record *AliasDef = nullptr; for (Record *A : SchedRead.Aliases) { const CodeGenSchedRW &AliasRW = - SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); + SchedModels.getSchedRW(A->getValueAsDef("AliasRW")); if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) { Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel"); if (&SchedModels.getProcModel(ModelDef) != &ProcModel) continue; } if (AliasDef) - PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases " - "defined for processor " + ProcModel.ModelName + - " Ensure only one SchedAlias exists per RW."); + PrintFatalError(AliasRW.TheDef->getLoc(), + "Multiple aliases " + "defined for processor " + + ProcModel.ModelName + + " Ensure only one SchedAlias exists per RW."); AliasDef = AliasRW.TheDef; } if (AliasDef && AliasDef->isSubClassOf("SchedReadAdvance")) @@ -955,12 +957,12 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead, for (Record *RA : ProcModel.ReadAdvanceDefs) { if (!RA->isSubClassOf("ReadAdvance")) continue; - if (AliasDef == RA->getValueAsDef("ReadType") - || SchedRead.TheDef == RA->getValueAsDef("ReadType")) { + if (AliasDef == RA->getValueAsDef("ReadType") || + SchedRead.TheDef == RA->getValueAsDef("ReadType")) { if (ResDef) { PrintFatalError(RA->getLoc(), "Resources are defined for both " - "SchedRead and its alias on processor " + - ProcModel.ModelName); + "SchedRead and its alias on processor " + + ProcModel.ModelName); } ResDef = RA; } @@ -970,7 +972,7 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead, if (!ResDef && SchedRead.TheDef->getName() != "ReadDefault") { PrintFatalError(ProcModel.ModelDef->getLoc(), 
Twine("Processor does not define resources for ") + - SchedRead.TheDef->getName()); + SchedRead.TheDef->getName()); } return ResDef; } @@ -994,11 +996,10 @@ void SubtargetEmitter::ExpandProcResources( if (SubDef->isSubClassOf("ProcResGroup")) { // Disallow this for simplicitly. PrintFatalError(SubDef->getLoc(), "Processor resource group " - " cannot be a super resources."); + " cannot be a super resources."); } - Record *SuperDef = - SchedModels.findProcResUnits(SubDef->getValueAsDef("Super"), PM, - SubDef->getLoc()); + Record *SuperDef = SchedModels.findProcResUnits( + SubDef->getValueAsDef("Super"), PM, SubDef->getLoc()); PRVec.push_back(SuperDef); ReleaseAtCycles.push_back(ReleaseAtCycles[i]); AcquireAtCycles.push_back(AcquireAtCycles[i]); @@ -1010,7 +1011,7 @@ void SubtargetEmitter::ExpandProcResources( continue; RecVec SuperResources = PR->getValueAsListOfDefs("Resources"); RecIter SubI = SubResources.begin(), SubE = SubResources.end(); - for( ; SubI != SubE; ++SubI) { + for (; SubI != SubE; ++SubI) { if (!is_contained(SuperResources, *SubI)) { break; } @@ -1051,7 +1052,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // A Variant SchedClass has no resources of its own. bool HasVariants = false; for (const CodeGenSchedTransition &CGT : - make_range(SC.Transitions.begin(), SC.Transitions.end())) { + make_range(SC.Transitions.begin(), SC.Transitions.end())) { if (CGT.ProcIndex == ProcModel.Index) { HasVariants = true; break; @@ -1114,8 +1115,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, std::vector ReadAdvanceEntries; for (unsigned W : Writes) { IdxVec WriteSeq; - SchedModels.expandRWSeqForProc(W, WriteSeq, /*IsRead=*/false, - ProcModel); + SchedModels.expandRWSeqForProc(W, WriteSeq, /*IsRead=*/false, ProcModel); // For each operand, create a latency entry. MCWriteLatencyEntry WLEntry; @@ -1125,7 +1125,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // If this Write is not referenced by a ReadAdvance, don't distinguish it // from other WriteLatency entries. if (!SchedModels.hasReadOfWrite( - SchedModels.getSchedWrite(WriteID).TheDef)) { + SchedModels.getSchedWrite(WriteID).TheDef)) { WriteID = 0; } WLEntry.WriteResourceID = WriteID; @@ -1133,7 +1133,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, for (unsigned WS : WriteSeq) { Record *WriteRes = - FindWriteResources(SchedModels.getSchedWrite(WS), ProcModel); + FindWriteResources(SchedModels.getSchedWrite(WS), ProcModel); // Mark the parent class as invalid for unsupported write types. 
if (WriteRes->getValueAsBit("Unsupported")) { @@ -1170,7 +1170,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, .concat(Twine(ReleaseAtCycles.size()))); } - if (!AcquireAtCycles.empty() && AcquireAtCycles.size() != PRVec.size()) { + if (!AcquireAtCycles.empty() && + AcquireAtCycles.size() != PRVec.size()) { PrintFatalError( WriteRes->getLoc(), Twine("Inconsistent resource cycles: size(AcquireAtCycles) != " @@ -1197,8 +1198,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, ExpandProcResources(PRVec, ReleaseAtCycles, AcquireAtCycles, ProcModel); assert(AcquireAtCycles.size() == ReleaseAtCycles.size()); - for (unsigned PRIdx = 0, PREnd = PRVec.size(); - PRIdx != PREnd; ++PRIdx) { + for (unsigned PRIdx = 0, PREnd = PRVec.size(); PRIdx != PREnd; + ++PRIdx) { MCWriteProcResEntry WPREntry; WPREntry.ProcResourceIdx = ProcModel.getProcResourceIdx(PRVec[PRIdx]); assert(WPREntry.ProcResourceIdx && "Bad ProcResourceIdx"); @@ -1220,9 +1221,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // serially, rather than multiple parallel uses. This is important for // in-order machine where the resource consumption is a hazard. unsigned WPRIdx = 0, WPREnd = WriteProcResources.size(); - for( ; WPRIdx != WPREnd; ++WPRIdx) { - if (WriteProcResources[WPRIdx].ProcResourceIdx - == WPREntry.ProcResourceIdx) { + for (; WPRIdx != WPREnd; ++WPRIdx) { + if (WriteProcResources[WPRIdx].ProcResourceIdx == + WPREntry.ProcResourceIdx) { // TODO: multiple use of the same resources would // require either 1. thinking of how to handle multiple // intervals for the same resource in @@ -1245,10 +1246,10 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, } // Create an entry for each operand Read in this SchedClass. // Entries must be sorted first by UseIdx then by WriteResourceID. - for (unsigned UseIdx = 0, EndIdx = Reads.size(); - UseIdx != EndIdx; ++UseIdx) { + for (unsigned UseIdx = 0, EndIdx = Reads.size(); UseIdx != EndIdx; + ++UseIdx) { Record *ReadAdvance = - FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel); + FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel); if (!ReadAdvance) continue; @@ -1267,7 +1268,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, } } llvm::sort(WriteIDs); - for(unsigned W : WriteIDs) { + for (unsigned W : WriteIDs) { MCReadAdvanceEntry RAEntry; RAEntry.UseIdx = UseIdx; RAEntry.WriteResourceID = W; @@ -1288,9 +1289,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, SCDesc.NumWriteProcResEntries = WriteProcResources.size(); std::vector::iterator WPRPos = - std::search(SchedTables.WriteProcResources.begin(), - SchedTables.WriteProcResources.end(), - WriteProcResources.begin(), WriteProcResources.end()); + std::search(SchedTables.WriteProcResources.begin(), + SchedTables.WriteProcResources.end(), + WriteProcResources.begin(), WriteProcResources.end()); if (WPRPos != SchedTables.WriteProcResources.end()) SCDesc.WriteProcResIdx = WPRPos - SchedTables.WriteProcResources.begin(); else { @@ -1300,10 +1301,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, } // Latency entries must remain in operand order. 
SCDesc.NumWriteLatencyEntries = WriteLatencies.size(); - std::vector::iterator WLPos = - std::search(SchedTables.WriteLatencies.begin(), - SchedTables.WriteLatencies.end(), - WriteLatencies.begin(), WriteLatencies.end()); + std::vector::iterator WLPos = std::search( + SchedTables.WriteLatencies.begin(), SchedTables.WriteLatencies.end(), + WriteLatencies.begin(), WriteLatencies.end()); if (WLPos != SchedTables.WriteLatencies.end()) { unsigned idx = WLPos - SchedTables.WriteLatencies.begin(); SCDesc.WriteLatencyIdx = idx; @@ -1312,8 +1312,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, std::string::npos) { SchedTables.WriterNames[idx + i] += std::string("_") + WriterNames[i]; } - } - else { + } else { SCDesc.WriteLatencyIdx = SchedTables.WriteLatencies.size(); llvm::append_range(SchedTables.WriteLatencies, WriteLatencies); llvm::append_range(SchedTables.WriterNames, WriterNames); @@ -1321,9 +1320,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, // ReadAdvanceEntries must remain in operand order. SCDesc.NumReadAdvanceEntries = ReadAdvanceEntries.size(); std::vector::iterator RAPos = - std::search(SchedTables.ReadAdvanceEntries.begin(), - SchedTables.ReadAdvanceEntries.end(), - ReadAdvanceEntries.begin(), ReadAdvanceEntries.end()); + std::search(SchedTables.ReadAdvanceEntries.begin(), + SchedTables.ReadAdvanceEntries.end(), + ReadAdvanceEntries.begin(), ReadAdvanceEntries.end()); if (RAPos != SchedTables.ReadAdvanceEntries.end()) SCDesc.ReadAdvanceIdx = RAPos - SchedTables.ReadAdvanceEntries.begin(); else { @@ -1355,8 +1354,8 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, // Emit global WriteLatencyTable. OS << "\n// {Cycles, WriteResourceID}\n" - << "extern const llvm::MCWriteLatencyEntry " - << Target << "WriteLatencyTable[] = {\n" + << "extern const llvm::MCWriteLatencyEntry " << Target + << "WriteLatencyTable[] = {\n" << " { 0, 0}, // Invalid\n"; for (unsigned WLIdx = 1, WLEnd = SchedTables.WriteLatencies.size(); WLIdx != WLEnd; ++WLIdx) { @@ -1371,8 +1370,8 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, // Emit global ReadAdvanceTable. OS << "\n// {UseIdx, WriteResourceID, Cycles}\n" - << "extern const llvm::MCReadAdvanceEntry " - << Target << "ReadAdvanceTable[] = {\n" + << "extern const llvm::MCReadAdvanceEntry " << Target + << "ReadAdvanceTable[] = {\n" << " {0, 0, 0}, // Invalid\n"; for (unsigned RAIdx = 1, RAEnd = SchedTables.ReadAdvanceEntries.size(); RAIdx != RAEnd; ++RAIdx) { @@ -1388,22 +1387,23 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, // Emit a SchedClass table for each processor. for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(), - PE = SchedModels.procModelEnd(); PI != PE; ++PI) { + PE = SchedModels.procModelEnd(); + PI != PE; ++PI) { if (!PI->hasInstrSchedModel()) continue; std::vector &SCTab = - SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())]; + SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())]; OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup, RetireOOO," << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n"; - OS << "static const llvm::MCSchedClassDesc " - << PI->ModelName << "SchedClasses[] = {\n"; + OS << "static const llvm::MCSchedClassDesc " << PI->ModelName + << "SchedClasses[] = {\n"; // The first class is always invalid. We no way to distinguish it except by // name and position. 
- assert(SchedModels.getSchedClass(0).Name == "NoInstrModel" - && "invalid class not first"); + assert(SchedModels.getSchedClass(0).Name == "NoInstrModel" && + "invalid class not first"); OS << " {DBGFIELD(\"InvalidSchedClass\") " << MCSchedClassDesc::InvalidNumMicroOps << ", false, false, false, 0, 0, 0, 0, 0, 0},\n"; @@ -1414,17 +1414,15 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables, OS << " {DBGFIELD(\"" << SchedClass.Name << "\") "; if (SchedClass.Name.size() < 18) OS.indent(18 - SchedClass.Name.size()); - OS << MCDesc.NumMicroOps - << ", " << ( MCDesc.BeginGroup ? "true" : "false" ) - << ", " << ( MCDesc.EndGroup ? "true" : "false" ) - << ", " << ( MCDesc.RetireOOO ? "true" : "false" ) - << ", " << format("%2d", MCDesc.WriteProcResIdx) - << ", " << MCDesc.NumWriteProcResEntries - << ", " << format("%2d", MCDesc.WriteLatencyIdx) - << ", " << MCDesc.NumWriteLatencyEntries - << ", " << format("%2d", MCDesc.ReadAdvanceIdx) - << ", " << MCDesc.NumReadAdvanceEntries - << "}, // #" << SCIdx << '\n'; + OS << MCDesc.NumMicroOps << ", " << (MCDesc.BeginGroup ? "true" : "false") + << ", " << (MCDesc.EndGroup ? "true" : "false") << ", " + << (MCDesc.RetireOOO ? "true" : "false") << ", " + << format("%2d", MCDesc.WriteProcResIdx) << ", " + << MCDesc.NumWriteProcResEntries << ", " + << format("%2d", MCDesc.WriteLatencyIdx) << ", " + << MCDesc.NumWriteLatencyEntries << ", " + << format("%2d", MCDesc.ReadAdvanceIdx) << ", " + << MCDesc.NumReadAdvanceEntries << "}, // #" << SCIdx << '\n'; } OS << "}; // " << PI->ModelName << "SchedClasses\n"; } @@ -1439,9 +1437,10 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { // Emit processor resource table. if (PM.hasInstrSchedModel()) EmitProcessorResources(PM, OS); - else if(!PM.ProcResourceDefs.empty()) - PrintFatalError(PM.ModelDef->getLoc(), "SchedMachineModel defines " - "ProcResources without defining WriteRes SchedWriteRes"); + else if (!PM.ProcResourceDefs.empty()) + PrintFatalError(PM.ModelDef->getLoc(), + "SchedMachineModel defines " + "ProcResources without defining WriteRes SchedWriteRes"); // Begin processor itinerary properties OS << "\n"; @@ -1454,13 +1453,13 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { EmitProcessorProp(OS, PM.ModelDef, "MispredictPenalty", ','); bool PostRAScheduler = - (PM.ModelDef ? PM.ModelDef->getValueAsBit("PostRAScheduler") : false); + (PM.ModelDef ? PM.ModelDef->getValueAsBit("PostRAScheduler") : false); - OS << " " << (PostRAScheduler ? "true" : "false") << ", // " + OS << " " << (PostRAScheduler ? "true" : "false") << ", // " << "PostRAScheduler\n"; bool CompleteModel = - (PM.ModelDef ? PM.ModelDef->getValueAsBit("CompleteModel") : false); + (PM.ModelDef ? PM.ModelDef->getValueAsBit("CompleteModel") : false); OS << " " << (CompleteModel ? 
"true" : "false") << ", // " << "CompleteModel\n"; @@ -1473,11 +1472,14 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { OS << " " << PM.Index << ", // Processor ID\n"; if (PM.hasInstrSchedModel()) - OS << " " << PM.ModelName << "ProcResources" << ",\n" - << " " << PM.ModelName << "SchedClasses" << ",\n" - << " " << PM.ProcResourceDefs.size()+1 << ",\n" - << " " << (SchedModels.schedClassEnd() - - SchedModels.schedClassBegin()) << ",\n"; + OS << " " << PM.ModelName << "ProcResources" + << ",\n" + << " " << PM.ModelName << "SchedClasses" + << ",\n" + << " " << PM.ProcResourceDefs.size() + 1 << ",\n" + << " " + << (SchedModels.schedClassEnd() - SchedModels.schedClassBegin()) + << ",\n"; else OS << " nullptr, nullptr, 0, 0," << " // No instruction-level machine model.\n"; @@ -1669,8 +1671,8 @@ void SubtargetEmitter::emitSchedModelHelpersImpl( // Construct a switch statement where the condition is a check on the // scheduling class identifier. There is a `case` for every variant class // defined by the processor models of this target. - // Each `case` implements a number of rules to resolve (i.e. to transition from) - // a variant scheduling class to another scheduling class. Rules are + // Each `case` implements a number of rules to resolve (i.e. to transition + // from) a variant scheduling class to another scheduling class. Rules are // described by instances of CodeGenSchedTransition. Note that transitions may // not be valid for all processors. OS << " switch (SchedClass) {\n"; @@ -1781,8 +1783,8 @@ void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName, OS << "unsigned " << ClassName << "::getHwMode() const {\n"; for (unsigned M = 1, NumModes = CGH.getNumModeIds(); M != NumModes; ++M) { const HwMode &HM = CGH.getMode(M); - OS << " if (checkFeatures(\"" << HM.Features - << "\")) return " << M << ";\n"; + OS << " if (checkFeatures(\"" << HM.Features << "\")) return " << M + << ";\n"; } OS << " return 0;\n}\n"; } @@ -1808,8 +1810,8 @@ void SubtargetEmitter::emitGetMacroFusions(const std::string &ClassName, // Produces a subtarget specific function for parsing // the subtarget features string. 
void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) { - std::vector Features = - Records.getAllDerivedDefinitions("SubtargetFeature"); + std::vector Features = + Records.getAllDerivedDefinitions("SubtargetFeature"); llvm::sort(Features, LessRecord()); OS << "// ParseSubtargetFeatures - Parses features string setting specified\n" @@ -1836,15 +1838,12 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) { StringRef Value = R->getValueAsString("Value"); StringRef FieldName = R->getValueAsString("FieldName"); - if (Value=="true" || Value=="false") - OS << " if (Bits[" << Target << "::" - << Instance << "]) " - << FieldName << " = " << Value << ";\n"; + if (Value == "true" || Value == "false") + OS << " if (Bits[" << Target << "::" << Instance << "]) " << FieldName + << " = " << Value << ";\n"; else - OS << " if (Bits[" << Target << "::" - << Instance << "] && " - << FieldName << " < " << Value << ") " - << FieldName << " = " << Value << ";\n"; + OS << " if (Bits[" << Target << "::" << Instance << "] && " << FieldName + << " < " << Value << ") " << FieldName << " = " << Value << ";\n"; } OS << "}\n"; @@ -1955,15 +1954,15 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << Target << "SubTypeKV, "; else OS << "std::nullopt, "; - OS << '\n'; OS.indent(22); - OS << Target << "WriteProcResTable, " - << Target << "WriteLatencyTable, " + OS << '\n'; + OS.indent(22); + OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, " << Target << "ReadAdvanceTable, "; - OS << '\n'; OS.indent(22); + OS << '\n'; + OS.indent(22); if (SchedModels.hasItineraries()) { - OS << Target << "Stages, " - << Target << "OperandCycles, " - << Target << "ForwardingPaths"; + OS << Target << "Stages, " << Target << "OperandCycles, " << Target + << "ForwardingPaths"; } else OS << "nullptr, nullptr, nullptr"; OS << ");\n}\n\n"; @@ -2027,12 +2026,12 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "namespace llvm {\n"; OS << "extern const llvm::SubtargetFeatureKV " << Target << "FeatureKV[];\n"; OS << "extern const llvm::SubtargetSubTypeKV " << Target << "SubTypeKV[];\n"; - OS << "extern const llvm::MCWriteProcResEntry " - << Target << "WriteProcResTable[];\n"; - OS << "extern const llvm::MCWriteLatencyEntry " - << Target << "WriteLatencyTable[];\n"; - OS << "extern const llvm::MCReadAdvanceEntry " - << Target << "ReadAdvanceTable[];\n"; + OS << "extern const llvm::MCWriteProcResEntry " << Target + << "WriteProcResTable[];\n"; + OS << "extern const llvm::MCWriteLatencyEntry " << Target + << "WriteLatencyTable[];\n"; + OS << "extern const llvm::MCReadAdvanceEntry " << Target + << "ReadAdvanceTable[];\n"; if (SchedModels.hasItineraries()) { OS << "extern const llvm::InstrStage " << Target << "Stages[];\n"; @@ -2051,15 +2050,15 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "ArrayRef(" << Target << "SubTypeKV, " << NumProcs << "), "; else OS << "std::nullopt, "; - OS << '\n'; OS.indent(24); - OS << Target << "WriteProcResTable, " - << Target << "WriteLatencyTable, " + OS << '\n'; + OS.indent(24); + OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, " << Target << "ReadAdvanceTable, "; - OS << '\n'; OS.indent(24); + OS << '\n'; + OS.indent(24); if (SchedModels.hasItineraries()) { - OS << Target << "Stages, " - << Target << "OperandCycles, " - << Target << "ForwardingPaths"; + OS << Target << "Stages, " << Target << "OperandCycles, " << Target + << "ForwardingPaths"; } else OS << "nullptr, nullptr, nullptr"; OS << ") {}\n\n"; diff --git 
a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
index 52afb4d..819abfa 100644
--- a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
+++ b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
@@ -81,7 +81,7 @@ void SubtargetFeatureInfo::emitNameTable(
   uint64_t IndexUB = 0;
   for (const auto &SF : SubtargetFeatures)
     if (IndexUB <= SF.second.Index)
-      IndexUB = SF.second.Index+1;
+      IndexUB = SF.second.Index + 1;
 
   std::vector<std::string> Names;
   if (IndexUB > 0)
diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.h b/llvm/utils/TableGen/SubtargetFeatureInfo.h
index 9401004..b1016ff 100644
--- a/llvm/utils/TableGen/SubtargetFeatureInfo.h
+++ b/llvm/utils/TableGen/SubtargetFeatureInfo.h
@@ -18,7 +18,8 @@ namespace llvm {
 struct SubtargetFeatureInfo;
-using SubtargetFeatureInfoMap = std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
+using SubtargetFeatureInfoMap =
+    std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
 
 /// Helper class for storing information on a subtarget feature which
 /// participates in instruction matching.
diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h
index 3afe6b0..e0d12ab 100644
--- a/llvm/utils/TableGen/TableGenBackends.h
+++ b/llvm/utils/TableGen/TableGenBackends.h
@@ -46,7 +46,6 @@
 // backends, this means that the EmitFoo function is the only thing not in
 // the anonymous namespace.
 
-
 // FIXME: Reorganize TableGen so that build dependencies can be more
 // accurately expressed. Currently, touching any of the emitters (or
 // anything that they transitively depend on) causes everything dependent
@@ -57,7 +56,6 @@
 // TableGen binary with as few dependencies as possible on the rest of
 // LLVM.
 
-
 namespace llvm {
 
 class raw_ostream;
diff --git a/llvm/utils/TableGen/Types.cpp b/llvm/utils/TableGen/Types.cpp
index aca8e36..35b79b3 100644
--- a/llvm/utils/TableGen/Types.cpp
+++ b/llvm/utils/TableGen/Types.cpp
@@ -15,7 +15,9 @@
 using namespace llvm;
 
-const char *llvm::getMinimalTypeForRange(uint64_t Range, unsigned MaxSize LLVM_ATTRIBUTE_UNUSED) {
+const char *
+llvm::getMinimalTypeForRange(uint64_t Range,
+                             unsigned MaxSize LLVM_ATTRIBUTE_UNUSED) {
   // TODO: The original callers only used 32 and 64 so these are the only
   // values permitted. Rather than widen the supported values we should
   // allow 64 for the callers that currently use 32 and remove the
diff --git a/llvm/utils/TableGen/Types.h b/llvm/utils/TableGen/Types.h
index f369d61..74f0f9f 100644
--- a/llvm/utils/TableGen/Types.h
+++ b/llvm/utils/TableGen/Types.h
@@ -16,6 +16,6 @@ namespace llvm {
 /// MaxSize indicates the largest size of integer to consider (in bits) and only
 /// supports values of at least 32.
 const char *getMinimalTypeForRange(uint64_t Range, unsigned MaxSize = 64);
-}
+} // namespace llvm
 
 #endif
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index dc037e4..2cf86d3 100644
--- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -63,7 +63,8 @@ void emitWebAssemblyDisassemblerTables(
     // should be the canonical one. This determines which variant gets
     // printed in a disassembly. We want e.g. "call" not "i32.call", and
     // "end" when we don't know if its "end_loop" or "end_block" etc.
-    bool IsCanonicalExisting = CGIP.second->TheDef->getValueAsBit("IsCanonical");
+    bool IsCanonicalExisting =
+        CGIP.second->TheDef->getValueAsBit("IsCanonical");
     // We already have one marked explicitly as canonical, so keep it.
if (IsCanonicalExisting) continue; @@ -126,7 +127,8 @@ void emitWebAssemblyDisassemblerTables( ++J) { size_t K = 0; for (; K < CurOperandList.size(); ++K) { - if (OperandTable[J + K] != CurOperandList[K]) break; + if (OperandTable[J + K] != CurOperandList[K]) + break; } if (K == CurOperandList.size()) { OperandStart = J; diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index fef8dc7..e4db995 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -83,7 +83,8 @@ void X86CompressEVEXTablesEmitter::printTable(const std::vector &Table, void X86CompressEVEXTablesEmitter::printCheckPredicate( const PredicateInstMap &PredicateInsts, raw_ostream &OS) { - OS << "static bool checkPredicate(unsigned Opc, const X86Subtarget *Subtarget) {\n" + OS << "static bool checkPredicate(unsigned Opc, const X86Subtarget " + "*Subtarget) {\n" << " switch (Opc) {\n" << " default: return true;\n"; for (const auto &[Key, Val] : PredicateInsts) { @@ -207,9 +208,9 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { NewInst = &TempInst; } } else { - // For each pre-compression instruction look for a match in the appropriate - // vector (instructions with the same opcode) using function object - // IsMatch. + // For each pre-compression instruction look for a match in the + // appropriate vector (instructions with the same opcode) using function + // object IsMatch. auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(Inst)); if (Match != CompressedInsts[Opcode].end()) NewInst = *Match; @@ -225,7 +226,7 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { return Name == "HasAVXNECONVERT" || Name == "HasAVXVNNI" || Name == "HasAVXIFMA"; }); - if(It!= Predicates.end()) + if (It != Predicates.end()) PredicateInsts[*It].push_back(NewInst); } diff --git a/llvm/utils/TableGen/X86DisassemblerShared.h b/llvm/utils/TableGen/X86DisassemblerShared.h index 093f220..f60fd47 100644 --- a/llvm/utils/TableGen/X86DisassemblerShared.h +++ b/llvm/utils/TableGen/X86DisassemblerShared.h @@ -49,9 +49,7 @@ struct OpcodeDecision { struct ContextDecision { OpcodeDecision opcodeDecisions[llvm::X86Disassembler::IC_max]; - ContextDecision() { - memset(opcodeDecisions, 0, sizeof(opcodeDecisions)); - } + ContextDecision() { memset(opcodeDecisions, 0, sizeof(opcodeDecisions)); } }; #endif diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp index 23886a3..588d9b2 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -41,8 +41,9 @@ static inline const char *stringForContext(InstructionContext insnContext) { break; #define ENUM_ENTRY_K_B(n, r, d) \ ENUM_ENTRY(n, r, d) \ - ENUM_ENTRY(n##_K_B, r, d) ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) \ - ENUM_ENTRY(n##_B, r, d) ENUM_ENTRY(n##_KZ_B, r, d) + ENUM_ENTRY(n##_K_B, r, d) \ + ENUM_ENTRY(n##_KZ, r, d) \ + ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d) ENUM_ENTRY(n##_KZ_B, r, d) INSTRUCTION_CONTEXTS #undef ENUM_ENTRY #undef ENUM_ENTRY_K_B @@ -595,8 +596,8 @@ static inline bool outranks(InstructionContext upper, #define ENUM_ENTRY_K_B(n, r, d) \ ENUM_ENTRY(n, r, d) \ ENUM_ENTRY(n##_K_B, r, d) \ - ENUM_ENTRY(n##_KZ_B, r, d) ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) \ - ENUM_ENTRY(n##_B, r, d) + ENUM_ENTRY(n##_KZ_B, r, d) \ + ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d) static int 
ranks[IC_max] = {INSTRUCTION_CONTEXTS}; #undef ENUM_ENTRY #undef ENUM_ENTRY_K_B @@ -822,7 +823,8 @@ void DisassemblerTables::emitContextDecision(raw_ostream &o1, raw_ostream &o2, } i2--; - o2.indent(i2) << "}};" << "\n"; + o2.indent(i2) << "}};" + << "\n"; } void DisassemblerTables::emitInstructionInfo(raw_ostream &o, @@ -859,7 +861,8 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o, } o << " },\n"; } - o << "};" << "\n\n"; + o << "};" + << "\n\n"; o.indent(i * 2) << "static const struct InstructionSpecifier "; o << INSTRUCTIONS_STR "[" << InstructionSpecifiers.size() << "] = {\n"; @@ -885,7 +888,8 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o, } i--; - o.indent(i * 2) << "};" << "\n"; + o.indent(i * 2) << "};" + << "\n"; } void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const { @@ -1004,7 +1008,8 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const { } i--; - o.indent(i * 2) << "};" << "\n"; + o.indent(i * 2) << "};" + << "\n"; } void DisassemblerTables::emitContextDecisions(raw_ostream &o1, raw_ostream &o2, diff --git a/llvm/utils/TableGen/X86DisassemblerTables.h b/llvm/utils/TableGen/X86DisassemblerTables.h index 4fbc58b..0f38274 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.h +++ b/llvm/utils/TableGen/X86DisassemblerTables.h @@ -91,8 +91,8 @@ private: /// @param ModRMTableNum - next table number for adding to ModRMTable. /// @param decision - The ModR/M decision to emit. This decision has 256 /// entries - emitModRMDecision decides how to compact it. - void emitModRMDecision(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, unsigned &ModRMTableNum, + void emitModRMDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum, ModRMDecision &decision) const; /// emitOpcodeDecision - Emits an OpcodeDecision and all its subsidiary ModR/M @@ -119,8 +119,8 @@ private: /// @param ModRMTableNum - next table number for adding to ModRMTable. /// @param decision - The OpcodeDecision to emit along with its subsidiary /// structures. - void emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, unsigned &ModRMTableNum, + void emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum, OpcodeDecision &decision) const; /// emitContextDecision - Emits a ContextDecision and all its subsidiary @@ -153,9 +153,9 @@ private: /// @param decision - The ContextDecision to emit along with its subsidiary /// structures. /// @param name - The name for the ContextDecision. - void emitContextDecision(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, unsigned &ModRMTableNum, - ContextDecision &decision, const char* name) const; + void emitContextDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum, + ContextDecision &decision, const char *name) const; /// emitInstructionInfo - Prints the instruction specifier table, which has /// one entry for each instruction, and contains name and operand @@ -200,7 +200,8 @@ private: /// IC is the context corresponding to the mask 0x00, and there are 256 /// possible masks. /// - /// @param o - The output stream to which the context table should be written. + /// @param o - The output stream to which the context table should be + /// written. /// @param i - The indent level for use with the stream. 
void emitContextTable(raw_ostream &o, uint32_t &i) const; @@ -213,9 +214,8 @@ private: /// @param i1 - The indent level to use with stream o1. /// @param i2 - The indent level to use with stream o2. /// @param ModRMTableNum - next table number for adding to ModRMTable. - void emitContextDecisions(raw_ostream &o1, raw_ostream &o2, - unsigned &i1, unsigned &i2, - unsigned &ModRMTableNum) const; + void emitContextDecisions(raw_ostream &o1, raw_ostream &o2, unsigned &i1, + unsigned &i2, unsigned &ModRMTableNum) const; /// setTableFields - Uses a ModRMFilter to set the appropriate entries in a /// ModRMDecision to refer to a particular instruction ID. @@ -224,10 +224,9 @@ private: /// @param filter - The filter to use in deciding which entries to populate. /// @param uid - The unique ID to set matching entries to. /// @param opcode - The opcode of the instruction, for error reporting. - void setTableFields(ModRMDecision &decision, - const ModRMFilter &filter, - InstrUID uid, - uint8_t opcode); + void setTableFields(ModRMDecision &decision, const ModRMFilter &filter, + InstrUID uid, uint8_t opcode); + public: /// Constructor - Allocates space for the class decisions and clears them. DisassemblerTables(); @@ -247,7 +246,8 @@ public: /// @param insnContext - The context to use (IC, IC_64BIT, etc.) /// @param opcode - The last byte of the opcode (not counting any escape /// or extended opcodes). - /// @param filter - The ModRMFilter that decides which ModR/M byte values + /// @param filter - The ModRMFilter that decides which ModR/M byte + /// values /// correspond to the desired instruction. /// @param uid - The unique ID of the instruction. /// @param is32bit - Instructon is only 32-bit @@ -255,23 +255,17 @@ public: /// @param ignoresVEX_L - Instruction ignores VEX.L /// @param ignoresVEX_W - Instruction ignores VEX.W /// @param AddrSize - Instructions address size 16/32/64. 0 is unspecified - void setTableFields(OpcodeType type, - InstructionContext insnContext, - uint8_t opcode, - const ModRMFilter &filter, - InstrUID uid, - bool is32bit, - bool noPrefix, - bool ignoresVEX_L, - bool ignoresVEX_W, - unsigned AddrSize); + void setTableFields(OpcodeType type, InstructionContext insnContext, + uint8_t opcode, const ModRMFilter &filter, InstrUID uid, + bool is32bit, bool noPrefix, bool ignoresVEX_L, + bool ignoresVEX_W, unsigned AddrSize); /// specForUID - Returns the instruction specifier for a given unique /// instruction ID. Used when resolving collisions. /// /// @param uid - The unique ID of the instruction. /// @return - A reference to the instruction specifier. - InstructionSpecifier& specForUID(InstrUID uid) { + InstructionSpecifier &specForUID(InstrUID uid) { if (uid >= InstructionSpecifiers.size()) InstructionSpecifiers.resize(uid + 1); @@ -282,9 +276,7 @@ public: // from any instructions added to the tables. // @return - true if there were; false otherwise. 
- bool hasConflicts() { - return HasConflicts; - } + bool hasConflicts() { return HasConflicts; } }; } // namespace X86Disassembler diff --git a/llvm/utils/TableGen/X86ModRMFilters.cpp b/llvm/utils/TableGen/X86ModRMFilters.cpp index cf75070..9cfb91c 100644 --- a/llvm/utils/TableGen/X86ModRMFilters.cpp +++ b/llvm/utils/TableGen/X86ModRMFilters.cpp @@ -10,14 +10,14 @@ using namespace llvm::X86Disassembler; -void ModRMFilter::anchor() { } +void ModRMFilter::anchor() {} -void DumbFilter::anchor() { } +void DumbFilter::anchor() {} -void ModFilter::anchor() { } +void ModFilter::anchor() {} -void ExtendedFilter::anchor() { } +void ExtendedFilter::anchor() {} -void ExtendedRMFilter::anchor() { } +void ExtendedRMFilter::anchor() {} -void ExactFilter::anchor() { } +void ExactFilter::anchor() {} diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h index d2169a8..b579f22 100644 --- a/llvm/utils/TableGen/X86ModRMFilters.h +++ b/llvm/utils/TableGen/X86ModRMFilters.h @@ -27,9 +27,10 @@ namespace X86Disassembler { /// ModR/M bytes. class ModRMFilter { virtual void anchor(); + public: /// Destructor - Override as necessary. - virtual ~ModRMFilter() { } + virtual ~ModRMFilter() {} /// isDumb - Indicates whether this filter returns the same value for /// any value of the ModR/M byte. @@ -50,14 +51,11 @@ public: /// for operands. class DumbFilter : public ModRMFilter { void anchor() override; + public: - bool isDumb() const override { - return true; - } + bool isDumb() const override { return true; } - bool accepts(uint8_t modRM) const override { - return true; - } + bool accepts(uint8_t modRM) const override { return true; } }; /// ModFilter - Filters based on the mod bits [bits 7-6] of the ModR/M byte. @@ -66,6 +64,7 @@ public: class ModFilter : public ModRMFilter { void anchor() override; bool R; + public: /// Constructor /// @@ -86,6 +85,7 @@ class ExtendedFilter : public ModRMFilter { void anchor() override; bool R; uint8_t NNN; + public: /// Constructor /// @@ -95,9 +95,9 @@ public: ExtendedFilter(bool r, uint8_t nnn) : R(r), NNN(nnn) {} bool accepts(uint8_t modRM) const override { - return (((R && ((modRM & 0xc0) == 0xc0)) || - (!R && ((modRM & 0xc0) != 0xc0))) && - (((modRM & 0x38) >> 3) == NNN)); + return ( + ((R && ((modRM & 0xc0) == 0xc0)) || (!R && ((modRM & 0xc0) != 0xc0))) && + (((modRM & 0x38) >> 3) == NNN)); } }; @@ -107,6 +107,7 @@ class ExtendedRMFilter : public ModRMFilter { void anchor() override; bool R; uint8_t NNN; + public: /// Constructor /// @@ -116,8 +117,7 @@ public: ExtendedRMFilter(bool r, uint8_t nnn) : R(r), NNN(nnn) {} bool accepts(uint8_t modRM) const override { - return ((R && ((modRM & 0xc0) == 0xc0)) && - ((modRM & 0x7) == NNN)); + return ((R && ((modRM & 0xc0) == 0xc0)) && ((modRM & 0x7) == NNN)); } }; /// ExactFilter - The occasional extended opcode (such as VMCALL or MONITOR) @@ -125,15 +125,14 @@ public: class ExactFilter : public ModRMFilter { void anchor() override; uint8_t ModRM; + public: /// Constructor /// /// \param modRM The required value of the full ModR/M byte. 
ExactFilter(uint8_t modRM) : ModRM(modRM) {} - bool accepts(uint8_t modRM) const override { - return (ModRM == modRM); - } + bool accepts(uint8_t modRM) const override { return (ModRM == modRM); } }; } // namespace X86Disassembler diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 18f9610..873f3ae 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1,4 +1,4 @@ -//===- X86RecognizableInstr.cpp - Disassembler instruction spec --*- C++ -*-===// +//===- X86RecognizableInstr.cpp - Disassembler instruction spec -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h index 007c700..549fc5b 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -1,4 +1,4 @@ -//===- X86RecognizableInstr.h - Disassembler instruction spec ----*- C++ -*-===// +//===- X86RecognizableInstr.h - Disassembler instruction spec ---*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. -- cgit v1.1 From 2f8e37d20114ecb223caaa5a72e8b7c13daf9f34 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 9 Feb 2024 09:36:05 +0100 Subject: [SROA] Unfold gep of index select (#80983) SROA currently supports converting a gep of select into select of gep if the select is in the pointer operand. This patch expands support to selects in an index operand. This is intended to address the regression reported in https://github.com/llvm/llvm-project/pull/68882#issuecomment-1924909922. --- llvm/lib/Transforms/Scalar/SROA.cpp | 59 ++++++++++++++++++++------- llvm/test/Transforms/SROA/select-gep.ll | 72 ++++++++++++++++++++++++++------- 2 files changed, 103 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index e92e245..138dc38 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3942,30 +3942,62 @@ private: return false; } - // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2) + // Fold gep (select cond, ptr1, ptr2), idx + // => select cond, gep(ptr1, idx), gep(ptr2, idx) + // and gep ptr, (select cond, idx1, idx2) + // => select cond, gep(ptr, idx1), gep(ptr, idx2) bool foldGEPSelect(GetElementPtrInst &GEPI) { - if (!GEPI.hasAllConstantIndices()) - return false; + // Check whether the GEP has exactly one select operand and all indices + // will become constant after the transform. 
+ SelectInst *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand()); + for (Value *Op : GEPI.indices()) { + if (auto *SI = dyn_cast<SelectInst>(Op)) { + if (Sel) + return false; + + Sel = SI; + if (!isa<ConstantInt>(Sel->getTrueValue()) || + !isa<ConstantInt>(Sel->getFalseValue())) + return false; + continue; + } - SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand()); + if (!isa<ConstantInt>(Op)) + return false; + } + + if (!Sel) + return false; LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):" << "\n original: " << *Sel << "\n " << GEPI); + auto GetNewOps = [&](Value *SelOp) { + SmallVector<Value *> NewOps; + for (Value *Op : GEPI.operands()) + if (Op == Sel) + NewOps.push_back(SelOp); + else + NewOps.push_back(Op); + return NewOps; + }; + + Value *True = Sel->getTrueValue(); + Value *False = Sel->getFalseValue(); + SmallVector<Value *> TrueOps = GetNewOps(True); + SmallVector<Value *> FalseOps = GetNewOps(False); + IRB.SetInsertPoint(&GEPI); - SmallVector<Value *, 4> Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); Type *Ty = GEPI.getSourceElementType(); - Value *True = Sel->getTrueValue(); - Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep", - IsInBounds); - - Value *False = Sel->getFalseValue(); + Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(), + True->getName() + ".sroa.gep", IsInBounds); - Value *NFalse = IRB.CreateGEP(Ty, False, Index, - False->getName() + ".sroa.gep", IsInBounds); + Value *NFalse = + IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(), + False->getName() + ".sroa.gep", IsInBounds); Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, Sel->getName() + ".sroa.sel"); @@ -4039,8 +4071,7 @@ private: } bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (isa<SelectInst>(GEPI.getPointerOperand()) && - foldGEPSelect(GEPI)) + if (foldGEPSelect(GEPI)) return true; if (isa<PHINode>(GEPI.getPointerOperand()) && diff --git a/llvm/test/Transforms/SROA/select-gep.ll b/llvm/test/Transforms/SROA/select-gep.ll index 56924a0..1342a2c 100644 --- a/llvm/test/Transforms/SROA/select-gep.ll +++ b/llvm/test/Transforms/SROA/select-gep.ll @@ -155,14 +155,24 @@ bb: ret i32 %load } - +; Test gep of index select unfolding on an alloca that is splittable, but not +; promotable. The allocas here will be optimized away by subsequent passes.
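+; Editorial sketch of the unfold exercised by these tests (value names are
+; illustrative, not taken from the generated CHECK lines):
+;   %idx = select i1 %c, i64 24, i64 0
+;   %gep = getelementptr inbounds i8, ptr %alloca, i64 %idx
+; becomes
+;   %gep.true = getelementptr inbounds i8, ptr %alloca, i64 24
+;   %gep.false = getelementptr inbounds i8, ptr %alloca, i64 0
+;   %ptr.sel = select i1 %c, ptr %gep.true, ptr %gep.false
+; so each arm is a constant-offset gep that SROA can assign to a slice.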
define i32 @test_select_idx_memcpy(i1 %c, ptr %p) { ; CHECK-LABEL: @test_select_idx_memcpy( -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[ALLOCA]], ptr [[P:%.*]], i64 160, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_0:%.*]] = alloca [4 x i8], align 8 +; CHECK-NEXT: [[ALLOCA_SROA_2:%.*]] = alloca [20 x i8], align 4 +; CHECK-NEXT: [[ALLOCA_SROA_22:%.*]] = alloca [4 x i8], align 8 +; CHECK-NEXT: [[ALLOCA_SROA_3:%.*]] = alloca [132 x i8], align 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ALLOCA_SROA_0]], ptr align 1 [[P:%.*]], i64 4, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_2_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ALLOCA_SROA_2]], ptr align 1 [[ALLOCA_SROA_2_0_P_SROA_IDX]], i64 20, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_22_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ALLOCA_SROA_22]], ptr align 1 [[ALLOCA_SROA_22_0_P_SROA_IDX]], i64 4, i1 false) +; CHECK-NEXT: [[ALLOCA_SROA_3_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ALLOCA_SROA_3]], ptr align 1 [[ALLOCA_SROA_3_0_P_SROA_IDX]], i64 132, i1 false) ; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]] -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IDX_SROA_SEL:%.*]] = select i1 [[C]], ptr [[ALLOCA_SROA_22]], ptr [[ALLOCA_SROA_0]] +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[IDX_SROA_SEL]], align 4 ; CHECK-NEXT: ret i32 [[RES]] ; %alloca = alloca [20 x i64], align 8 @@ -173,16 +183,13 @@ define i32 @test_select_idx_memcpy(i1 %c, ptr %p) { ret i32 %res } +; Test gep of index select unfolding on an alloca that is splittable and +; promotable. define i32 @test_select_idx_mem2reg(i1 %c) { ; CHECK-LABEL: @test_select_idx_mem2reg( -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 -; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 -; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]] -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP2]], align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: [[RES_SROA_SPECULATED:%.*]] = select i1 [[C]], i32 2, i32 1 +; CHECK-NEXT: ret i32 [[RES_SROA_SPECULATED]] ; %alloca = alloca [20 x i64], align 8 store i32 1, ptr %alloca @@ -194,6 +201,9 @@ define i32 @test_select_idx_mem2reg(i1 %c) { ret i32 %res } +; Test gep of index select unfolding on an alloca that escaped, and as such +; is not splittable or promotable. +; FIXME: Ideally, no transform would take place in this case. 
define i32 @test_select_idx_escaped(i1 %c, ptr %p) { ; CHECK-LABEL: @test_select_idx_escaped( ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 ; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 ; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]] -; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[DOTSROA_GEP:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 +; CHECK-NEXT: [[DOTSROA_GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 0 +; CHECK-NEXT: [[IDX_SROA_SEL:%.*]] = select i1 [[C]], ptr [[DOTSROA_GEP]], ptr [[DOTSROA_GEP1]] +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[IDX_SROA_SEL]], align 4 ; CHECK-NEXT: ret i32 [[RES]] ; %alloca = alloca [20 x i64], align 8 @@ -217,6 +229,38 @@ define i32 @test_select_idx_escaped(i1 %c, ptr %p) { ret i32 %res } +; FIXME: Should we allow recursive select unfolding if all the leaves are +; constants? +define i32 @test_select_idx_nested(i1 %c, i1 %c2) { +; CHECK-LABEL: @test_select_idx_nested( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 +; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 8 +; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24 +; CHECK-NEXT: store i32 3, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[IDX1:%.*]] = select i1 [[C:%.*]], i64 24, i64 0 +; CHECK-NEXT: [[IDX2:%.*]] = select i1 [[C2:%.*]], i64 [[IDX1]], i64 8 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX2]] +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP3]], align 4 +; CHECK-NEXT: ret i32 [[RES]] +; + %alloca = alloca [20 x i64], align 8 + store i32 1, ptr %alloca + %gep1 = getelementptr inbounds i8, ptr %alloca, i64 8 + store i32 2, ptr %gep1 + %gep2 = getelementptr inbounds i8, ptr %alloca, i64 24 + store i32 3, ptr %gep2 + %idx1 = select i1 %c, i64 24, i64 0 + %idx2 = select i1 %c2, i64 %idx1, i64 8 + %gep3 = getelementptr inbounds i8, ptr %alloca, i64 %idx2 + %res = load i32, ptr %gep3, align 4 + ret i32 %res +} + +; The following cases involve non-constant indices and should not be +; transformed. + define i32 @test_select_idx_not_constant1(i1 %c, ptr %p, i64 %arg) { ; CHECK-LABEL: @test_select_idx_not_constant1( ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8 -- cgit v1.1 From 38b54c72ca83fd56830b13d2a8d7749887b77922 Mon Sep 17 00:00:00 2001 From: Jean Perier Date: Fri, 9 Feb 2024 00:50:48 -0800 Subject: [flang] fix shared library builds after #81166 Fix https://lab.llvm.org/buildbot/#/builders/268/builds/7826 IsDerivedTypeWithLengthParameter cannot be used here because it would make libFortranEvaluate dependent on libFortranSemantics. Replace it with a loop through the parameter values. --- flang/lib/Evaluate/characteristics.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp index c14a422..80b0f34 100644 --- a/flang/lib/Evaluate/characteristics.cpp +++ b/flang/lib/Evaluate/characteristics.cpp @@ -474,9 +474,13 @@ bool DummyDataObject::IsPassedByDescriptor(bool isBindC) const { // Need to pass dynamic type info in a descriptor.
return true; } else if (const auto *derived{GetDerivedTypeSpec(type.type())}) { - if (const semantics::Scope *scope = derived->scope()) { - // Need to pass length type parameters in a descriptor if any. - return scope->IsDerivedTypeWithLengthParameter(); + if (!derived->parameters().empty()) { + for (const auto &param : derived->parameters()) { + if (param.second.isLen()) { + // Need to pass length type parameters in a descriptor. + return true; + } + } + } } else if (isBindC && type.type().IsAssumedLengthCharacter()) { // Fortran 2018 18.3.6 point 2 (5) -- cgit v1.1 From f6610578653fd47535a18284c688d725943ee8c3 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Fri, 9 Feb 2024 09:57:38 +0100 Subject: Revert "[AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init" (#81234) Reverts llvm/llvm-project#79586 This broke the AMDGPU OpenMP Offload buildbot. The typical error message was that the GPU attempted to read beyond the largest legal address. Error message: AMDGPU fatal error 1: Received error in queue 0x7f8363f22000: HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION: The agent attempted to access memory beyond the largest legal address. --- llvm/docs/AMDGPUUsage.rst | 10 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 108 +++---- llvm/lib/Target/AMDGPU/SIFrameLowering.h | 14 +- .../AMDGPU/GlobalISel/call-outgoing-stack-args.ll | 10 +- .../abi-attribute-hints-undefined-behavior.ll | 18 +- .../blender-no-live-segment-at-def-implicit-def.ll | 5 +- .../AMDGPU/branch-folding-implicit-def-subreg.ll | 7 +- llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 329 +++++++++------------ llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll | 30 +- llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 29 +- .../CodeGen/AMDGPU/callee-special-input-vgprs.ll | 6 +- llvm/test/CodeGen/AMDGPU/cc-update.ll | 84 +++--- .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 10 +- .../CodeGen/AMDGPU/indirect-call-known-callees.ll | 9 +- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 20 +- .../AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll | 5 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 60 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 5 +- .../CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll | 15 +- .../CodeGen/AMDGPU/lower-module-lds-via-table.ll | 15 +- ...machine-sink-temporal-divergence-swdev407790.ll | 14 +- .../CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll | 15 +- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 7 +- .../CodeGen/AMDGPU/tuple-allocation-failure.ll | 14 +- llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 5 +- 25 files changed, 350 insertions(+), 494 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 3019968..6b24171 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5530,13 +5530,9 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies Instead the flat SCRATCH instructions are used. Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs -that are used as a V# to access scratch.
CP uses the value provided by the +runtime. It is used, together with Scratch Wavefront Offset as an offset, to +access the private memory space using a segment address. See :ref:`amdgpu-amdhsa-initial-kernel-execution-state`. The scratch V# is a four-aligned SGPR and always selected for the kernel as diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6327a81..d02aee7 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -379,8 +379,7 @@ public: } // namespace llvm // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` -// and return the FlatScratchInit Register used -Register SIFrameLowering::emitEntryFunctionFlatScratchInit( +void SIFrameLowering::emitEntryFunctionFlatScratchInit( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -400,7 +399,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( Register FlatScrInitLo; Register FlatScrInitHi; - Register FlatScratchInitReg; if (ST.isAmdPalOS()) { // Extract the scratch offset from the descriptor in the GIT // Find unused reg to load flat scratch init into MachineRegisterInfo &MRI = MF.getRegInfo(); + Register FlatScrInit = AMDGPU::NoRegister; ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; AllSGPR64s = AllSGPR64s.slice( @@ -418,28 +417,16 @@ for (MCPhysReg Reg : AllSGPR64s) { if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) && MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { - FlatScratchInitReg = Reg; + FlatScrInit = Reg; break; } } + assert(FlatScrInit && "Failed to find free register for scratch init"); - } else { - FlatScratchInitReg = - MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); - - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(FlatScratchInitReg); - MBB.addLiveIn(FlatScratchInitReg); - } - - assert(FlatScratchInitReg && "Failed to find free register for scratch init"); - - FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - - if (ST.isAmdPalOS()) { + FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); - buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg); + buildGitPtr(MBB, I, DL, TII, FlatScrInit); // We now have the GIT ptr - now get the scratch descriptor from the entry // at offset 0 (or offset 16 for a compute shader). @@ -454,8 +441,8 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ?
16 : 0; const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); - BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg) - .addReg(FlatScratchInitReg) + BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) + .addReg(FlatScrInit) .addImm(EncodedOffset) // offset .addImm(0) // cpol .addMemOperand(MMO); @@ -463,9 +450,20 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( // Mask the offset in [47:0] of the descriptor const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) - .addReg(FlatScrInitHi) - .addImm(0xffff); + .addReg(FlatScrInitHi) + .addImm(0xffff); And->getOperand(3).setIsDead(); // Mark SCC as dead. + } else { + Register FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + assert(FlatScratchInitReg); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + + FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); } // Do a 64-bit pointer add. @@ -488,21 +486,20 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( addReg(FlatScrInitHi). addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); - return FlatScratchInitReg; + return; } - assert(ST.getGeneration() == AMDGPUSubtarget::GFX9); - + // For GFX9. BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitLo) - .addReg(ScratchWaveOffsetReg); + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) .addReg(FlatScrInitHi) .addImm(0); Addc->getOperand(3).setIsDead(); // Mark SCC as dead. - return AMDGPU::FLAT_SCR; + return; } assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); @@ -523,7 +520,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit( .addReg(FlatScrInitLo, RegState::Kill) .addImm(8); LShr->getOperand(3).setIsDead(); // Mark SCC as dead. - return AMDGPU::FLAT_SCR; } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not @@ -615,15 +611,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); assert(MFI->isEntryFunction()); - bool NeedsFlatScratchInit = - MFI->getUserSGPRInfo().hasFlatScratchInit() && - (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || - (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); - Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); @@ -649,7 +641,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Now that we have fixed the reserved SRSRC we need to locate the // (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg; - if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) { + if (ST.isAmdHsaOrMesa(F)) { PreloadedScratchRsrcReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); if (ScratchRsrcReg && PreloadedScratchRsrcReg) { @@ -705,30 +697,33 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } + bool NeedsFlatScratchInit = + MFI->getUserSGPRInfo().hasFlatScratchInit() && + (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || + (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + if ((NeedsFlatScratchInit || ScratchRsrcReg) && PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } - Register FlatScratchInit; if (NeedsFlatScratchInit) { - FlatScratchInit = - emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); } if (ScratchRsrcReg) { - emitEntryFunctionScratchRsrcRegSetup( - MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg, - PreloadedScratchRsrcReg, ScratchWaveOffsetReg); + emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, + PreloadedScratchRsrcReg, + ScratchRsrcReg, ScratchWaveOffsetReg); } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg, - Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const { + const DebugLoc &DL, Register PreloadedScratchRsrcReg, + Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -776,8 +771,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(21) .addReg(Rsrc03); } - } else if (ST.isMesaGfxShader(Fn) || - (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) { + } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -836,26 +830,6 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( .addImm(Rsrc23 >> 32) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else if (ST.isAmdHsaOrMesa(Fn)) { - - if (FlatScratchInit) { - const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), - TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1)) - .addReg(FlatScratchInit) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - BuildMI(MBB, I, DL, SMovB32, Lo_32) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, SMovB32, Hi_32) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - return; - } - assert(PreloadedScratchRsrcReg); if (ScratchRsrcReg != PreloadedScratchRsrcReg) { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index f706d48..b3feb75 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -67,19 +67,19 @@ public:
MachineBasicBlock::iterator MI) const override; private: - Register - emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const DebugLoc &DL, - Register ScratchWaveOffsetReg) const; + void emitEntryFunctionFlatScratchInit(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register ScratchWaveOffsetReg) const; Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const; void emitEntryFunctionScratchRsrcRegSetup( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register FlatScratchInit, Register ScratchRsrcReg, - Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const; + Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, + Register ScratchWaveOffsetReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 61bc28b..2465298 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -13,11 +13,10 @@ define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_mov_b32 s32, 0 -; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 @@ -62,10 +61,9 @@ define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; MUBUF-NEXT: s_mov_b32 s2, -1 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_mov_b32 s3, 0xe00000 -; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch +; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 609b5e6..a439c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -48,20 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-SDAG: ; %bb.0: ; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-SDAG-NEXT: s_mov_b32 s2, -1 -; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; FIXEDABI-SDAG-NEXT: s_mov_b32 s3, 0x11e80000 +; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch +; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2 ; FIXEDABI-SDAG-NEXT: 
s_mov_b64 s[8:9], 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0 +; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 @@ -71,20 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-GISEL: ; %bb.0: ; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1 -; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000 +; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch +; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0 +; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index 74c6bb5..7c8d40c 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -10,9 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index c06f213..5a128c7 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -5,14 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-LABEL: name: f1 ; GFX90A: bb.0.bb: ; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0 ; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc ; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit 
$scc - ; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index f72d22b..863bd0d 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -129,13 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i1_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 @@ -235,9 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 @@ -341,9 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 @@ -425,13 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i8_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 @@ 
-529,9 +525,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 @@ -630,9 +625,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 @@ -713,13 +707,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 @@ -816,9 +809,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 @@ -917,9 +909,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 @@ -1000,13 +991,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 @@ 
-1088,14 +1078,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 @@ -1193,13 +1182,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 @@ -1290,16 +1278,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 @@ -1404,13 +1391,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: s_mov_b32 s32, 0 @@ -1528,13 +1514,12 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: v_mov_b32_e32 v6, 3 @@ -1620,13 +1605,12 @@ 
define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 @@ -1705,13 +1689,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 @@ -1793,14 +1776,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 @@ -1886,15 +1868,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 @@ -1987,17 +1968,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 
2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: v_mov_b32_e32 v3, -1.0 ; HSA-NEXT: v_mov_b32_e32 v4, 0.5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 @@ -2079,14 +2059,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 @@ -2175,16 +2154,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 ; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 @@ -2280,11 +2258,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -2292,6 +2268,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v4, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 @@ -2380,15 +2357,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-LABEL: test_call_external_void_func_v2i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; 
HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 @@ -2480,15 +2456,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2581,15 +2556,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2673,14 +2647,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 3 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2764,14 +2737,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2863,15 +2835,14 @@ 
define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -2957,14 +2928,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -3055,15 +3025,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-LABEL: test_call_external_void_func_v2f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 @@ -3151,15 +3120,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3242,14 +3210,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; 
HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3335,15 +3302,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 @@ -3432,16 +3398,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: v_mov_b32_e32 v3, 6 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 @@ -3528,15 +3493,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3626,16 +3590,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; 
HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3728,17 +3691,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: v_mov_b32_e32 v4, 5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 @@ -3841,14 +3803,13 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -3954,11 +3915,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v8i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 @@ -3968,6 +3927,7 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v6, 7 ; HSA-NEXT: v_mov_b32_e32 v7, 8 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -4078,6 +4038,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4085,9 +4046,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 ; HSA-NEXT: buffer_load_dwordx4 
v[12:15], off, s[4:7], 0 offset:48 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -4224,6 +4183,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4235,10 +4195,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 @@ -4401,10 +4359,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -4509,15 +4466,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; HSA-LABEL: test_call_external_i32_func_i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_mov_b32 s39, 0x1100f000 ; HSA-NEXT: s_mov_b32 s38, -1 ; HSA-NEXT: s_getpc_b64 s[4:5] @@ -4625,14 +4581,13 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 ; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -4747,11 +4702,9 @@ define 
amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4759,6 +4712,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x400 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 @@ -4923,11 +4877,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; HSA-NEXT: v_mov_b32_e32 v0, 8 @@ -4935,6 +4887,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x800 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -5132,13 +5085,12 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 @@ -5387,15 +5339,14 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64: ; HSA: ; %bb.0: ; %entry ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80 ; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: 
s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: v_mov_b32_e32 v0, s23 ; HSA-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index 8e2fca5..c62a082 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -11,11 +11,10 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -32,10 +31,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -52,10 +50,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -72,10 +69,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -92,10 +88,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -113,10 +108,9 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 
v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 6db5eff..616e5f0 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -7,13 +7,12 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 @@ -31,11 +30,10 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v0, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -54,12 +52,11 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -77,12 +74,11 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) # define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -103,13 +99,12 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 0705d49..9f535a9 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI: v_or_b32_e32 v31, v0, v1 +; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 ; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI: v_or_b32_e32 v31, v0, v1 +; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { ; FIXEDABI-NOT: v2 ; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2 ; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI: v_or_b32_e32 v31, v1, v0 +; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index 7188883..6f42fd0 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -68,14 +68,13 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -89,12 +88,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -114,12 +112,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: 
s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -151,14 +148,13 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -175,12 +171,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX900-LABEL: test_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -204,11 +199,10 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -317,14 +311,13 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -339,12 +332,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX900-LABEL: test_force_fp_kern_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; 
GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -366,12 +358,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_getpc_b64 s[16:17] ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 @@ -422,15 +413,14 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX803-LABEL: test_force_fp_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX803-NEXT: s_mov_b32 s3, 0x11e80000 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 @@ -447,13 +437,12 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX900-LABEL: test_force_fp_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_mov_b32 s2, -1 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_mov_b32 s3, 0xe00000 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 @@ -478,11 +467,10 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_mov_b32 s2, -1 -; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000 -; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11] -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 68c632a..11871db 100644 --- 
a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -180,9 +180,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 @@ -230,9 +229,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xe00000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 2d019ef..47110d9 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -13,6 +13,8 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -23,17 +25,14 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, 1, s7 -; GFX9-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-NEXT: s_cmp_eq_u32 s4, 1 -; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_cselect_b32 s5, s13, s11 ; GFX9-NEXT: s_cselect_b32 s4, s12, s10 -; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index a66ed93..408199b 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -12,9 +12,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -38,9 +37,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 -; GISEL-NEXT: 
s_mov_b64 s[0:1], flat_scratch +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] @@ -69,9 +67,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_getpc_b64 s[14:15] @@ -96,9 +93,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) { ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GISEL-NEXT: s_add_i32 s12, s12, s17 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000 -; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GISEL-NEXT: s_mov_b32 s13, s15 ; GISEL-NEXT: s_mov_b32 s12, s14 ; GISEL-NEXT: s_getpc_b64 s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 8843efd..6e90554 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -11,9 +11,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b32 s33, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 4851c4f..66f31bb 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -118,11 +118,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -178,11 +177,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 
s7, s7, use_module@gotpcrel32@hi+12 @@ -238,11 +236,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -298,11 +295,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 @@ -345,6 +341,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -353,9 +351,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 0 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -375,15 +370,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -416,6 +410,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -424,9 +420,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; 
CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 2 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -446,15 +439,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -487,6 +479,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -495,9 +489,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 1 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -517,15 +508,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -558,6 +548,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -566,9 +558,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 
s[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s15, 3 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -588,15 +577,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_add_u32 s8, s4, 8 ; CHECK-NEXT: s_addc_u32 s9, s5, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 26271a0..61818da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -45,9 +45,8 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index f780188..bb7c43f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -164,9 +164,8 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -199,9 +198,8 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -242,9 +240,8 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll 
b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index fa4b93f..4d73436 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -229,9 +229,8 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 @@ -269,9 +268,8 @@ define amdgpu_kernel void @k23() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 @@ -312,9 +310,8 @@ define amdgpu_kernel void @k123() { ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GCN-NEXT: s_add_i32 s6, s6, s9 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x1e8f000 -; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index e17f311..138a6a8 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -44,18 +44,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s35, 0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 @@ -782,18 +781,17 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_add_u32 s42, s36, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_addc_u32 s43, s37, 
0 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] ; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_mov_b32 s40, s13 ; CHECK-NEXT: s_mov_b32 s41, s12 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 70a9bbb..f70441e 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -69,9 +69,8 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -129,9 +128,8 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -242,9 +240,8 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0xe00000 -; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e6d9c0d..e7c5aaf 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s3, 0xe00000 +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: s_mul_i32 s4, s4, s5 @@ -55,9 +55,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch -; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 ; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: ds_write_b64 v0, v[3:4] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 8d8459f..1118cc3 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ 
-45,8 +45,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) @@ -71,10 +73,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS1-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 -; GLOBALNESS1-NEXT: s_mov_b32 s3, 0xe00000 -; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS1-NEXT: s_mov_b32 s68, s14 ; GLOBALNESS1-NEXT: s_mov_b32 s69, s13 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s12 @@ -333,8 +332,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) @@ -359,10 +360,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS0-NEXT: s_mov_b32 s2, -1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 -; GLOBALNESS0-NEXT: s_mov_b32 s3, 0xe00000 -; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch ; GLOBALNESS0-NEXT: s_mov_b32 s66, s14 ; GLOBALNESS0-NEXT: s_mov_b32 s67, s13 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s12 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 7d759089..7840559 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -14,9 +14,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 -- cgit v1.1 From d72e8c259637991c8d0be642a5ab2bfeb19b1c6e Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 9 Feb 2024 09:57:59 +0100 Subject: [NFC] Add b9079ba to git-blame-ignore-revs (#81233) --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index ea84e31..1f498a8 
100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -78,3 +78,6 @@ f6d557ee34b6bbdb1dc32f29e34b4a4a8ad35e81 082b89b25faae3e45a023caf51b65ca0f02f377f 0ba22f51d128bee9d69756c56c4678097270e10b 84da0e1bb75f8666cf222d2f600f37bebb9ea389 + +# [NFC] clang-format utils/TableGen (#80973) +b9079baaddfed5e604fbfaa1d81a7a1c38e78c26 -- cgit v1.1 From df2513c80bbd444ce97d28961bd5c20ffd7d3c44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 09:18:47 +0100 Subject: [clang][Interp] Fix three-way comparison detection Instead of using !T && CPlusPlus, just check the BinaryOperator's opcode. Turns out we also hit this code path for some assignments of structs in C++. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 2 +- clang/test/SemaCXX/conditional-expr.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 21bc29f..bf45615 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -464,7 +464,7 @@ bool ByteCodeExprGen::VisitBinaryOperator(const BinaryOperator *BO) { // Special case for C++'s three-way/spaceship operator <=>, which // returns a std::{strong,weak,partial}_ordering (which is a class, so doesn't // have a PrimType). - if (!T && Ctx.getLangOpts().CPlusPlus) { + if (!T && BO->getOpcode() == BO_Cmp) { if (DiscardResult) return true; const ComparisonCategoryInfo *CmpInfo = diff --git a/clang/test/SemaCXX/conditional-expr.cpp b/clang/test/SemaCXX/conditional-expr.cpp index 9a5e2ba..01effaa 100644 --- a/clang/test/SemaCXX/conditional-expr.cpp +++ b/clang/test/SemaCXX/conditional-expr.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx11 -std=c++11 -Wsign-conversion %s +// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx11 -std=c++11 -Wsign-conversion %s -fexperimental-new-constant-interpreter // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx17 -std=c++17 -Wsign-conversion %s +// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx17 -std=c++17 -Wsign-conversion %s -fexperimental-new-constant-interpreter // C++ rules for ?: are a lot stricter than C rules, and have to take into // account more conversion options. 
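For context, a minimal sketch (assuming C++20; the type and names below are illustrative, not taken from the patch or its tests) of the two expression shapes the fix distinguishes. Both statements have a class-typed, non-primitive result, which is why testing !T plus the C++ language mode alone was not enough:

#include <compare>

struct S {
  int V;
  auto operator<=>(const S &) const = default; // deduces std::strong_ordering
};

constexpr bool F() {
  S A{1}, B{2}, C{3};
  auto Ord = A <=> B; // BO_Cmp: needs the comparison-category handling
  C = A;              // BO_Assign: also a class-typed result, but not a <=>
  return Ord < 0;     // 1 < 2, so F() is true
}
static_assert(F());

Checking the opcode directly lets the struct assignment fall through to the regular binary-operator handling instead of the ordering path.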
-- cgit v1.1

From 79e43eb3e20484bdb6f32eecc336742dd721fcc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 10:11:51 +0100
Subject: [clang][Interp] Protect ArrayDecay ops against dummy pointers

---
 clang/lib/AST/Interp/Interp.h          |  3 +++
 clang/test/AST/Interp/arrays.cpp       | 20 ++++++++++++++++++++
 clang/test/SemaCXX/self-comparison.cpp |  1 +
 3 files changed, 24 insertions(+)

diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 1299a70..bcabd93 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1891,6 +1891,9 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) {
 inline bool ArrayDecay(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
+  if (Ptr.isDummy())
+    return false;
+
   if (!Ptr.isUnknownSizeArray()) {
     S.Stk.push<Pointer>(Ptr.atIndex(0));
     return true;
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index e14ff34..dedfa01 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -598,3 +598,23 @@ namespace NonConstReads {
   const int y = 0;
   int yy[y];
 }
+
+namespace SelfComparison {
+  struct S {
+    int field;
+    static int static_field;
+    int array[4];
+  };
+
+  struct T {
+    int field;
+    static int static_field;
+    int array[4];
+    S s;
+  };
+
+  int struct_test(S s1, S s2, S *s3, T t) {
+    return s3->array[t.field] == s3->array[t.field]; // expected-warning {{self-comparison always evaluates to true}} \
+                                                     // ref-warning {{self-comparison always evaluates to true}}
+  };
+}
diff --git a/clang/test/SemaCXX/self-comparison.cpp b/clang/test/SemaCXX/self-comparison.cpp
index 72127f1..c3c8755 100644
--- a/clang/test/SemaCXX/self-comparison.cpp
+++ b/clang/test/SemaCXX/self-comparison.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++2a
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++2a -fexperimental-new-constant-interpreter
 
 int foo(int x) {
   return x == x; // expected-warning {{self-comparison always evaluates to true}}
-- cgit v1.1

From 9e73656af524a2c592978aec91de67316c5ce69f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 10:23:54 +0100
Subject: [clang][Interp] Support ExpressionTraitExprs

Just push a constant bool value.
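Expression traits are Embarcadero-style intrinsics whose value Sema already knows at compile time, so the byte-code compiler only needs to materialize that constant bool. An illustrative use, not taken from the patch and assuming a C++11-or-later compile so static_assert is available:

    int n;
    static_assert(__is_lvalue_expr(n), "a named variable is an lvalue");
    static_assert(!__is_lvalue_expr(1), "a literal is not an lvalue");
    static_assert(__is_rvalue_expr(1), "a literal is an rvalue");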
---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 7 +++++++
 clang/lib/AST/Interp/ByteCodeExprGen.h   | 1 +
 clang/test/SemaCXX/expression-traits.cpp | 1 +
 3 files changed, 9 insertions(+)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index bf45615..2539e75 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -2051,6 +2051,13 @@ bool ByteCodeExprGen<Emitter>::VisitCXXInheritedCtorInitExpr(
   return this->emitCall(F, E);
 }
 
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitExpressionTraitExpr(
+    const ExpressionTraitExpr *E) {
+  assert(Ctx.getLangOpts().CPlusPlus);
+  return this->emitConstBool(E->getValue(), E);
+}
+
 template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
   if (E->containsErrors())
     return false;
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index c908a9b..ae216f5 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -112,6 +112,7 @@ public:
   bool VisitChooseExpr(const ChooseExpr *E);
   bool VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *E);
   bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E);
+  bool VisitExpressionTraitExpr(const ExpressionTraitExpr *E);
 
 protected:
   bool visitExpr(const Expr *E) override;
diff --git a/clang/test/SemaCXX/expression-traits.cpp b/clang/test/SemaCXX/expression-traits.cpp
index a76f0c4..64ddca0 100644
--- a/clang/test/SemaCXX/expression-traits.cpp
+++ b/clang/test/SemaCXX/expression-traits.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -fcxx-exceptions %s
+// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -fcxx-exceptions %s -fexperimental-new-constant-interpreter
 //
 // Tests for "expression traits" intrinsics such as __is_lvalue_expr.
-- cgit v1.1

From 5609bd83c3bd39a7522b05b32decc9e3c8ad08ae Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Fri, 9 Feb 2024 01:49:39 -0800
Subject: Revert "[clang-format] Update FormatToken::isSimpleTypeSpecifier() (#80241)"

This reverts commit 763139afc19ddf2e0f0265dc828ce8e5fbe92530.

It seems that LangOpts is not initialized before use.
---
 clang/include/clang/Format/Format.h   |  2 --
 clang/lib/Format/FormatToken.cpp      | 35 ++++++++++++++++++++++++++++++++++-
 clang/lib/Format/FormatTokenLexer.cpp |  7 +++----
 clang/lib/Format/FormatTokenLexer.h   |  1 +
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index bb63d33..cb14d98 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -5175,8 +5175,6 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style,
                                             ArrayRef<tooling::Range> Ranges,
                                             StringRef FileName = "");
 
-extern LangOptions LangOpts;
-
 /// Returns the ``LangOpts`` that the formatter expects you to set.
 ///
 /// \param Style determines specific settings for lexing mode.
diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp
index 69f751d..b791c5a 100644
--- a/clang/lib/Format/FormatToken.cpp
+++ b/clang/lib/Format/FormatToken.cpp
@@ -34,8 +34,41 @@ const char *getTokenTypeName(TokenType Type) {
   return nullptr;
 }
 
+// FIXME: This is copy&pasted from Sema. Put it in a common place and remove
+// duplication.
 bool FormatToken::isSimpleTypeSpecifier() const {
-  return Tok.isSimpleTypeSpecifier(LangOpts);
+  switch (Tok.getKind()) {
+  case tok::kw_short:
+  case tok::kw_long:
+  case tok::kw___int64:
+  case tok::kw___int128:
+  case tok::kw_signed:
+  case tok::kw_unsigned:
+  case tok::kw_void:
+  case tok::kw_char:
+  case tok::kw_int:
+  case tok::kw_half:
+  case tok::kw_float:
+  case tok::kw_double:
+  case tok::kw___bf16:
+  case tok::kw__Float16:
+  case tok::kw___float128:
+  case tok::kw___ibm128:
+  case tok::kw_wchar_t:
+  case tok::kw_bool:
+#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
+#include "clang/Basic/TransformTypeTraits.def"
+  case tok::annot_typename:
+  case tok::kw_char8_t:
+  case tok::kw_char16_t:
+  case tok::kw_char32_t:
+  case tok::kw_typeof:
+  case tok::kw_decltype:
+  case tok::kw__Atomic:
+    return true;
+  default:
+    return false;
+  }
 }
 
 bool FormatToken::isTypeOrIdentifier() const {
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 31b2b7e..a87d0ba 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -22,20 +22,18 @@
 namespace clang {
 namespace format {
 
-LangOptions LangOpts;
-
 FormatTokenLexer::FormatTokenLexer(
     const SourceManager &SourceMgr, FileID ID, unsigned Column,
     const FormatStyle &Style, encoding::Encoding Encoding,
     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
     IdentifierTable &IdentTable)
     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
-      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
+      Column(Column), TrailingWhitespace(0),
+      LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
       Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
       Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
       MacroBlockEndRegex(Style.MacroBlockEnd) {
-  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
   Lex->SetKeepWhitespaceMode(true);
 
@@ -1444,6 +1442,7 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) {
 
 void FormatTokenLexer::resetLexer(unsigned Offset) {
   StringRef Buffer = SourceMgr.getBufferData(ID);
+  LangOpts = getFormattingLangOpts(Style);
   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
                       Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
   Lex->SetKeepWhitespaceMode(true);
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 52838f1..65dd733 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -120,6 +120,7 @@ private:
   unsigned Column;
   unsigned TrailingWhitespace;
   std::unique_ptr<Lexer> Lex;
+  LangOptions LangOpts;
   const SourceManager &SourceMgr;
   FileID ID;
   const FormatStyle &Style;
-- cgit v1.1

From 245d7727d51548c3d5d867b69b1f9b1efff2502e Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Fri, 9 Feb 2024 01:53:47 -0800
Subject: Revert "[clang-format] Fix an out-of-bounds bug uncovered by 763139afc19d"

This reverts commit 173e674ba55eb93e8af43f2eece7feffe9954b34.

Actually, NotTokens[5] in QualifierFixerTest.cpp is not out of bounds.
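The root cause behind the LangOpts revert above is an ordering hazard: a file-scope LangOptions object was written only inside the FormatTokenLexer constructor, so any caller that queried a token before a lexer was constructed read a default-constructed object. A minimal sketch of the hazard, with hypothetical names rather than clang-format code:

    #include <cassert>

    struct Options { bool CPlusPlus = false; };
    Options GlobalOpts; // file-scope state, default-constructed

    struct TokenSource {
      TokenSource(bool IsCpp) { GlobalOpts.CPlusPlus = IsCpp; } // sole writer
    };

    bool isCppKeywordContext() { return GlobalOpts.CPlusPlus; } // reader

    int main() {
      assert(!isCppKeywordContext()); // read before any TokenSource exists:
                                      // silently uses the default value
      TokenSource TS(/*IsCpp=*/true);
      assert(isCppKeywordContext()); // now reflects the real configuration
    }

Keeping LangOpts as a member initialized in the constructor's init list, as the revert restores, removes the window in which the global is stale.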
--- clang/unittests/Format/QualifierFixerTest.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 4e1768d..324366c 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1100,6 +1100,8 @@ TEST_F(QualifierFixerTest, IsQualifierType) { NotTokens[3], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( NotTokens[4], ConfiguredTokens)); + EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( + NotTokens[5], ConfiguredTokens)); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[0])); @@ -1111,6 +1113,8 @@ TEST_F(QualifierFixerTest, IsQualifierType) { LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[3])); EXPECT_FALSE( LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[4])); + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[5])); } TEST_F(QualifierFixerTest, IsMacro) { -- cgit v1.1 From c227eca73970c65d9663e6d65abe3f9daef2a25f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 11:01:07 +0100 Subject: [clang][Interp][NFC] Convert test case to verify=expected,both style --- clang/test/AST/Interp/arrays.cpp | 247 +++++++++++++-------------------------- 1 file changed, 83 insertions(+), 164 deletions(-) diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp index dedfa01..3c06ab5 100644 --- a/clang/test/AST/Interp/arrays.cpp +++ b/clang/test/AST/Interp/arrays.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s -// RUN: %clang_cc1 -verify=ref %s -// RUN: %clang_cc1 -verify=ref -std=c++20 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected,both %s +// RUN: %clang_cc1 -verify=ref,both %s +// RUN: %clang_cc1 -verify=ref,both -std=c++20 %s constexpr int m = 3; constexpr const int *foo[][5] = { @@ -73,53 +73,40 @@ static_assert(getElementFromEnd(data, 5, 0) == 1, ""); static_assert(getElementFromEnd(data, 5, 4) == 5, ""); constexpr int getFirstElem(const int *a) { - return a[0]; // expected-note {{read of dereferenced null pointer}} \ - // ref-note {{read of dereferenced null pointer}} + return a[0]; // both-note {{read of dereferenced null pointer}} } -static_assert(getFirstElem(nullptr) == 1, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} +static_assert(getFirstElem(nullptr) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} constexpr static int arr[2] = {1,2}; constexpr static int arr2[2] = {3,4}; constexpr int *p1 = nullptr; -constexpr int *p2 = p1 + 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot perform pointer arithmetic on null pointer}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot perform pointer arithmetic on null pointer}} +constexpr int *p2 = p1 + 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot perform pointer arithmetic on null pointer}} constexpr int *p3 = p1 + 0; 
constexpr int *p4 = p1 - 0; constexpr int *p5 = 0 + p1; -constexpr int *p6 = 0 - p1; // expected-error {{invalid operands to binary expression}} \ - // ref-error {{invalid operands to binary expression}} +constexpr int *p6 = 0 - p1; // both-error {{invalid operands to binary expression}} constexpr int const * ap1 = &arr[0]; -constexpr int const * ap2 = ap1 + 3; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element 3 of array of 2}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element 3 of array of 2}} - -constexpr auto ap3 = arr - 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element -1}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element -1}} +constexpr int const * ap2 = ap1 + 3; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element 3 of array of 2}} + +constexpr auto ap3 = arr - 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element -1}} constexpr int k1 = &arr[1] - &arr[0]; static_assert(k1 == 1, ""); static_assert((&arr[0] - &arr[1]) == -1, ""); -constexpr int k2 = &arr2[1] - &arr[0]; // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} +constexpr int k2 = &arr2[1] - &arr[0]; // both-error {{must be initialized by a constant expression}} static_assert((arr + 0) == arr, ""); static_assert(&arr[0] == arr, ""); static_assert(*(&arr[0]) == 1, ""); static_assert(*(&arr[1]) == 2, ""); -constexpr const int *OOB = (arr + 3) - 3; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element 3 of array of 2}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element 3 of array of 2}} +constexpr const int *OOB = (arr + 3) - 3; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element 3 of array of 2}} template constexpr T getElementOf(T* array, int i) { @@ -135,11 +122,8 @@ constexpr T& getElementOfArray(T (&array)[N], int I) { static_assert(getElementOfArray(foo[2], 3) == &m, ""); -static_assert(data[0] == 4, ""); // expected-error{{failed}} \ - // expected-note{{5 == 4}} \ - // ref-error{{failed}} \ - // ref-note{{5 == 4}} - +static_assert(data[0] == 4, ""); // both-error{{failed}} \ + // both-note{{5 == 4}} constexpr int dynamic[] = { f, 3, 2 + 5, data[3], *getElementOf(foo[2], 3) @@ -185,21 +169,15 @@ struct fred y [] = { [0] = { .s[0] = 'q' } }; namespace indices { constexpr int first[] = {1}; - constexpr int firstValue = first[2]; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element 2 of array of 1}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element 2 of array of 1}} + constexpr int firstValue = first[2]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element 2 of array of 1}} constexpr int second[10] = {17}; - constexpr int secondValue = second[10];// ref-error {{must be initialized by a constant expression}} \ - // ref-note {{read of dereferenced one-past-the-end pointer}} \ - // expected-error {{must be initialized by a constant expression}} \ - // 
expected-note {{read of dereferenced one-past-the-end pointer}} - - constexpr int negative = second[-2]; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{cannot refer to element -2 of array of 10}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{cannot refer to element -2 of array of 10}} + constexpr int secondValue = second[10];// both-error {{must be initialized by a constant expression}} \ + // both-note {{read of dereferenced one-past-the-end pointer}} \ + + constexpr int negative = second[-2]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cannot refer to element -2 of array of 10}} }; namespace DefaultInit { @@ -222,12 +200,9 @@ public: class AU { public: int a; - constexpr AU() : a(5 / 0) {} // expected-warning {{division by zero is undefined}} \ - // expected-note 2{{division by zero}} \ - // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} \ - // ref-note 2{{division by zero}} \ - // ref-warning {{division by zero is undefined}} + constexpr AU() : a(5 / 0) {} // both-warning {{division by zero is undefined}} \ + // both-note 2{{division by zero}} \ + // both-error {{never produces a constant expression}} }; class B { public: @@ -241,13 +216,10 @@ static_assert(b.a[1].a == 12, ""); class BU { public: AU a[2]; - constexpr BU() {} // expected-note {{in call to 'AU()'}} \ - // ref-note {{in call to 'AU()'}} + constexpr BU() {} // both-note {{in call to 'AU()'}} }; -constexpr BU bu; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to 'BU()'}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to 'BU()'}} +constexpr BU bu; // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to 'BU()'}} namespace IncDec { constexpr int getNextElem(const int *A, int I) { @@ -311,62 +283,43 @@ namespace IncDec { } static_assert(getSecondToLast2() == 3, ""); - constexpr int bad1() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int bad1() { // both-error {{never produces a constant expression}} const int *e = E + 3; e++; // This is fine because it's a one-past-the-end pointer - return *e; // expected-note 2{{read of dereferenced one-past-the-end pointer}} \ - // ref-note 2{{read of dereferenced one-past-the-end pointer}} + return *e; // both-note 2{{read of dereferenced one-past-the-end pointer}} } - static_assert(bad1() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(bad1() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} - constexpr int bad2() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int bad2() { // both-error {{never produces a constant expression}} const int *e = E + 4; - e++; // expected-note 2{{cannot refer to element 5 of array of 4 elements}} \ - // ref-note 2{{cannot refer to element 5 of array of 4 elements}} + e++; // both-note 2{{cannot refer to element 5 of array of 4 elements}} return *e; // This is UB as well } - static_assert(bad2() == 0, ""); // expected-error {{not an integral constant expression}} \ - // 
expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(bad2() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} - - constexpr int bad3() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} + constexpr int bad3() { // both-error {{never produces a constant expression}} const int *e = E; - e--; // expected-note 2{{cannot refer to element -1 of array of 4 elements}} \ - // ref-note 2{{cannot refer to element -1 of array of 4 elements}} + e--; // both-note 2{{cannot refer to element -1 of array of 4 elements}} return *e; // This is UB as well } - static_assert(bad3() == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(bad3() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} constexpr int nullptr1(bool Pre) { int *a = nullptr; if (Pre) - ++a; // ref-note {{arithmetic on null pointer}} \ - // expected-note {{arithmetic on null pointer}} + ++a; // both-note {{arithmetic on null pointer}} else - a++; // ref-note {{arithmetic on null pointer}} \ - // expected-note {{arithmetic on null pointer}} + a++; // both-note {{arithmetic on null pointer}} return 1; } - static_assert(nullptr1(true) == 1, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} - - static_assert(nullptr1(false) == 1, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} + static_assert(nullptr1(true) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + + static_assert(nullptr1(false) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} }; namespace ZeroInit { @@ -425,28 +378,20 @@ namespace NoInitMapLeak { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdivision-by-zero" #pragma clang diagnostic ignored "-Wc++20-extensions" - constexpr int testLeak() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} + constexpr int testLeak() { // both-error {{never produces a constant expression}} int a[2]; a[0] = 1; // interrupts interpretation. 
- (void)(1 / 0); // expected-note 2{{division by zero}} \ - // ref-note 2{{division by zero}} - + (void)(1 / 0); // both-note 2{{division by zero}} return 1; } #pragma clang diagnostic pop - static_assert(testLeak() == 1, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'testLeak()'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'testLeak()'}} + static_assert(testLeak() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'testLeak()'}} - - constexpr int a[] = {1,2,3,4/0,5}; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{division by zero}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{division by zero}} \ + constexpr int a[] = {1,2,3,4/0,5}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{division by zero}} \ // ref-note {{declared here}} /// FIXME: This should fail in the new interpreter as well. @@ -456,18 +401,13 @@ namespace NoInitMapLeak { static_assert(b == 1, ""); // ref-error {{not an integral constant expression}} \ // ref-note {{not a constant expression}} - constexpr int f() { // expected-error {{never produces a constant expression}} \ - // ref-error {{never produces a constant expression}} - int a[] = {19,2,3/0,4}; // expected-note 2{{division by zero}} \ - // expected-warning {{is undefined}} \ - // ref-note 2{{division by zero}} \ - // ref-warning {{is undefined}} + constexpr int f() { // both-error {{never produces a constant expression}} + int a[] = {19,2,3/0,4}; // both-note 2{{division by zero}} \ + // both-warning {{is undefined}} return 1; } - static_assert(f() == 1, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} + static_assert(f() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} } namespace Incomplete { @@ -477,38 +417,27 @@ namespace Incomplete { }; constexpr Foo F{}; - constexpr const int *A = F.a; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{array-to-pointer decay of array member without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{array-to-pointer decay of array member without known bound}} - - constexpr const int *B = F.a + 1; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{array-to-pointer decay of array member without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{array-to-pointer decay of array member without known bound}} - - constexpr int C = *F.a; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{array-to-pointer decay of array member without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{array-to-pointer decay of array member without known bound}} + constexpr const int *A = F.a; // both-error {{must be initialized by a constant expression}} \ + // both-note {{array-to-pointer decay of array member without known bound}} + constexpr const int *B = F.a + 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{array-to-pointer decay of array member without known bound}} + constexpr int C = *F.a; // both-error {{must be 
initialized by a constant expression}} \ + // both-note {{array-to-pointer decay of array member without known bound}} /// These are from test/SemaCXX/constant-expression-cxx11.cpp /// and are the only tests using the 'indexing of array without known bound' diagnostic. /// We currently diagnose them differently. extern int arr[]; // expected-note 3{{declared here}} - constexpr int *c = &arr[1]; // ref-error {{must be initialized by a constant expression}} \ + constexpr int *c = &arr[1]; // both-error {{must be initialized by a constant expression}} \ // ref-note {{indexing of array without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ // expected-note {{read of non-constexpr variable 'arr'}} - constexpr int *d = &arr[1]; // ref-error {{must be initialized by a constant expression}} \ + constexpr int *d = &arr[1]; // both-error {{must be initialized by a constant expression}} \ // ref-note {{indexing of array without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ // expected-note {{read of non-constexpr variable 'arr'}} - constexpr int *e = arr + 1; // ref-error {{must be initialized by a constant expression}} \ + constexpr int *e = arr + 1; // both-error {{must be initialized by a constant expression}} \ // ref-note {{indexing of array without known bound}} \ - // expected-error {{must be initialized by a constant expression}} \ // expected-note {{read of non-constexpr variable 'arr'}} } @@ -528,8 +457,7 @@ namespace GH69115 { if (C) return; // Invalid in constexpr. - (void)(1 / 0); // expected-warning {{undefined}} \ - // ref-warning {{undefined}} + (void)(1 / 0); // both-warning {{undefined}} } class F { @@ -569,23 +497,15 @@ namespace GH69115 { namespace NonConstReads { #if __cplusplus >= 202002L - void *p = nullptr; // ref-note {{declared here}} \ - // expected-note {{declared here}} - - int arr[!p]; // ref-error {{not allowed at file scope}} \ - // expected-error {{not allowed at file scope}} \ - // ref-warning {{variable length arrays}} \ - // ref-note {{read of non-constexpr variable 'p'}} \ - // expected-warning {{variable length arrays}} \ - // expected-note {{read of non-constexpr variable 'p'}} - int z; // ref-note {{declared here}} \ - // expected-note {{declared here}} - int a[z]; // ref-error {{not allowed at file scope}} \ - // expected-error {{not allowed at file scope}} \ - // ref-warning {{variable length arrays}} \ - // ref-note {{read of non-const variable 'z'}} \ - // expected-warning {{variable length arrays}} \ - // expected-note {{read of non-const variable 'z'}} + void *p = nullptr; // both-note {{declared here}} + + int arr[!p]; // both-error {{not allowed at file scope}} \ + // both-warning {{variable length arrays}} \ + // both-note {{read of non-constexpr variable 'p'}} + int z; // both-note {{declared here}} + int a[z]; // both-error {{not allowed at file scope}} \ + // both-warning {{variable length arrays}} \ + // both-note {{read of non-const variable 'z'}} #else void *p = nullptr; int arr[!p]; // ref-error {{not allowed at file scope}} \ @@ -614,7 +534,6 @@ namespace SelfComparison { }; int struct_test(S s1, S s2, S *s3, T t) { - return s3->array[t.field] == s3->array[t.field]; // expected-warning {{self-comparison always evaluates to true}} \ - // ref-warning {{self-comparison always evaluates to true}} + return s3->array[t.field] == s3->array[t.field]; // both-warning {{self-comparison always evaluates to true}} }; } -- cgit v1.1 From 02362b1ad1c07a01714b195d769400dd40dbfd04 
Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 02:11:44 -0800 Subject: [clang-format] Check token size in QualifierFixerTest.cpp --- clang/unittests/Format/QualifierFixerTest.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 324366c..0aa755a 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1055,6 +1055,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) { auto Tokens = annotate( "const static inline auto restrict int double long constexpr friend"); + ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( Tokens[0], ConfiguredTokens)); @@ -1089,6 +1090,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) { EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[9])); auto NotTokens = annotate("for while do Foo Bar "); + ASSERT_EQ(NotTokens.size(), 6u) << Tokens; EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( NotTokens[0], ConfiguredTokens)); @@ -1120,6 +1122,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) { TEST_F(QualifierFixerTest, IsMacro) { auto Tokens = annotate("INT INTPR Foo int"); + ASSERT_EQ(Tokens.size(), 5u) << Tokens; EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isPossibleMacro(Tokens[0])); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isPossibleMacro(Tokens[1])); -- cgit v1.1 From ac3bd2bd530127786741bd9f164d66a3c3f40961 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 9 Feb 2024 11:20:19 +0100 Subject: [LoopReroll] Remove unused and unmaintained pass (#80972) Remove the LoopReroll pass, which is both unused (in any default pipeline) and unmaintained, with numerous open correctness issues (https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+loop-reroll). The removal is in line with https://discourse.llvm.org/t/rfc-disallow-unmaintained-unused-passes/75151. There is also a defunct `-freroll-loops` option in clang, which I'll remove separately. Migrated from https://reviews.llvm.org/D150684. 
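For context, rerolling reversed manual unrolling; the example from the deleted pass's own header comment (quoted in the removed source below) is representative:

    int foo(int a);
    void bar(int *x) {
      for (int i = 0; i < 500; i += 3) {
        foo(i);
        foo(i + 1);
        foo(i + 2);
      }
    }
    // was rerolled into:
    //   for (int i = 0; i < 500; ++i)
    //     foo(i);

Before this change the transform could still be requested explicitly, e.g. via something like "opt -passes=loop-reroll" (the registration being deleted from PassRegistry.def below); afterwards such invocations fail with an unknown-pass error.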
--- llvm/include/llvm/Transforms/Scalar/LoopReroll.h | 25 - llvm/lib/Passes/PassBuilder.cpp | 1 - llvm/lib/Passes/PassRegistry.def | 1 - llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 - llvm/lib/Transforms/Scalar/LoopRerollPass.cpp | 1679 -------------------- llvm/test/Transforms/LoopReroll/basic.ll | 976 ------------ llvm/test/Transforms/LoopReroll/basic32iters.ll | 328 ---- llvm/test/Transforms/LoopReroll/complex_reroll.ll | 237 --- llvm/test/Transforms/LoopReroll/external_use.ll | 60 - llvm/test/Transforms/LoopReroll/extra_instr.ll | 361 ----- llvm/test/Transforms/LoopReroll/indvar_with_ext.ll | 184 --- llvm/test/Transforms/LoopReroll/negative.ll | 48 - llvm/test/Transforms/LoopReroll/nonconst_lb.ll | 168 -- llvm/test/Transforms/LoopReroll/ptrindvar.ll | 125 -- llvm/test/Transforms/LoopReroll/reduction.ll | 132 -- llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll | 130 -- 16 files changed, 4456 deletions(-) delete mode 100644 llvm/include/llvm/Transforms/Scalar/LoopReroll.h delete mode 100644 llvm/lib/Transforms/Scalar/LoopRerollPass.cpp delete mode 100644 llvm/test/Transforms/LoopReroll/basic.ll delete mode 100644 llvm/test/Transforms/LoopReroll/basic32iters.ll delete mode 100644 llvm/test/Transforms/LoopReroll/complex_reroll.ll delete mode 100644 llvm/test/Transforms/LoopReroll/external_use.ll delete mode 100644 llvm/test/Transforms/LoopReroll/extra_instr.ll delete mode 100644 llvm/test/Transforms/LoopReroll/indvar_with_ext.ll delete mode 100644 llvm/test/Transforms/LoopReroll/negative.ll delete mode 100644 llvm/test/Transforms/LoopReroll/nonconst_lb.ll delete mode 100644 llvm/test/Transforms/LoopReroll/ptrindvar.ll delete mode 100644 llvm/test/Transforms/LoopReroll/reduction.ll delete mode 100644 llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll diff --git a/llvm/include/llvm/Transforms/Scalar/LoopReroll.h b/llvm/include/llvm/Transforms/Scalar/LoopReroll.h deleted file mode 100644 index 496e8df..0000000 --- a/llvm/include/llvm/Transforms/Scalar/LoopReroll.h +++ /dev/null @@ -1,25 +0,0 @@ -//===- LoopReroll.h - Loop rerolling pass ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H -#define LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H - -#include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" - -namespace llvm { - -class LoopRerollPass : public PassInfoMixin { -public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); -}; - -} // end namespace llvm - -#endif // LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 7c306c4..007dc76 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -216,7 +216,6 @@ #include "llvm/Transforms/Scalar/LoopLoadElimination.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/LoopPredication.h" -#include "llvm/Transforms/Scalar/LoopReroll.h" #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" #include "llvm/Transforms/Scalar/LoopSink.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 4451180..6cb87fb 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -599,7 +599,6 @@ LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-predication", LoopPredicationPass()) LOOP_PASS("loop-reduce", LoopStrengthReducePass()) -LOOP_PASS("loop-reroll", LoopRerollPass()) LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass()) LOOP_PASS("loop-unroll-full", LoopFullUnrollPass()) LOOP_PASS("loop-versioning-licm", LoopVersioningLICMPass()) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 2dd2703..5527efa 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -40,7 +40,6 @@ add_llvm_component_library(LLVMScalarOpts LoopLoadElimination.cpp LoopPassManager.cpp LoopPredication.cpp - LoopRerollPass.cpp LoopRotation.cpp LoopSimplifyCFG.cpp LoopStrengthReduce.cpp diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp deleted file mode 100644 index 7f62526..0000000 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ /dev/null @@ -1,1679 +0,0 @@ -//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass implements a simple loop reroller. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/LoopReroll.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" -#include -#include -#include -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "loop-reroll" - -STATISTIC(NumRerolledLoops, "Number of rerolled loops"); - -static cl::opt -NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), - cl::Hidden, - cl::desc("The maximum number of failures to tolerate" - " during fuzzy matching. (default: 400)")); - -// This loop re-rolling transformation aims to transform loops like this: -// -// int foo(int a); -// void bar(int *x) { -// for (int i = 0; i < 500; i += 3) { -// foo(i); -// foo(i+1); -// foo(i+2); -// } -// } -// -// into a loop like this: -// -// void bar(int *x) { -// for (int i = 0; i < 500; ++i) -// foo(i); -// } -// -// It does this by looking for loops that, besides the latch code, are composed -// of isomorphic DAGs of instructions, with each DAG rooted at some increment -// to the induction variable, and where each DAG is isomorphic to the DAG -// rooted at the induction variable (excepting the sub-DAGs which root the -// other induction-variable increments). In other words, we're looking for loop -// bodies of the form: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// f(%iv) -// %iv.1 = add %iv, 1 <-- a root increment -// f(%iv.1) -// %iv.2 = add %iv, 2 <-- a root increment -// f(%iv.2) -// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment -// f(%iv.scale_m_1) -// ... -// %iv.next = add %iv, scale -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// where each f(i) is a set of instructions that, collectively, are a function -// only of i (and other loop-invariant values). 
-// -// As a special case, we can also reroll loops like this: -// -// int foo(int); -// void bar(int *x) { -// for (int i = 0; i < 500; ++i) { -// x[3*i] = foo(0); -// x[3*i+1] = foo(0); -// x[3*i+2] = foo(0); -// } -// } -// -// into this: -// -// void bar(int *x) { -// for (int i = 0; i < 1500; ++i) -// x[i] = foo(0); -// } -// -// in which case, we're looking for inputs like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// %scaled.iv = mul %iv, scale -// f(%scaled.iv) -// %scaled.iv.1 = add %scaled.iv, 1 -// f(%scaled.iv.1) -// %scaled.iv.2 = add %scaled.iv, 2 -// f(%scaled.iv.2) -// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 -// f(%scaled.iv.scale_m_1) -// ... -// %iv.next = add %iv, 1 -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit - -namespace { - - enum IterationLimits { - /// The maximum number of iterations that we'll try and reroll. - IL_MaxRerollIterations = 32, - /// The bitvector index used by loop induction variables and other - /// instructions that belong to all iterations. - IL_All, - IL_End - }; - - class LoopReroll { - public: - LoopReroll(AliasAnalysis *AA, LoopInfo *LI, ScalarEvolution *SE, - TargetLibraryInfo *TLI, DominatorTree *DT, bool PreserveLCSSA) - : AA(AA), LI(LI), SE(SE), TLI(TLI), DT(DT), - PreserveLCSSA(PreserveLCSSA) {} - bool runOnLoop(Loop *L); - - protected: - AliasAnalysis *AA; - LoopInfo *LI; - ScalarEvolution *SE; - TargetLibraryInfo *TLI; - DominatorTree *DT; - bool PreserveLCSSA; - - using SmallInstructionVector = SmallVector; - using SmallInstructionSet = SmallPtrSet; - using TinyInstructionVector = SmallVector; - - // Map between induction variable and its increment - DenseMap IVToIncMap; - - // For loop with multiple induction variables, remember the ones used only to - // control the loop. - TinyInstructionVector LoopControlIVs; - - // A chain of isomorphic instructions, identified by a single-use PHI - // representing a reduction. Only the last value may be used outside the - // loop. - struct SimpleLoopReduction { - SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) { - assert(isa(P) && "First reduction instruction must be a PHI"); - add(L); - } - - bool valid() const { - return Valid; - } - - Instruction *getPHI() const { - assert(Valid && "Using invalid reduction"); - return Instructions.front(); - } - - Instruction *getReducedValue() const { - assert(Valid && "Using invalid reduction"); - return Instructions.back(); - } - - Instruction *get(size_t i) const { - assert(Valid && "Using invalid reduction"); - return Instructions[i+1]; - } - - Instruction *operator [] (size_t i) const { return get(i); } - - // The size, ignoring the initial PHI. - size_t size() const { - assert(Valid && "Using invalid reduction"); - return Instructions.size()-1; - } - - using iterator = SmallInstructionVector::iterator; - using const_iterator = SmallInstructionVector::const_iterator; - - iterator begin() { - assert(Valid && "Using invalid reduction"); - return std::next(Instructions.begin()); - } - - const_iterator begin() const { - assert(Valid && "Using invalid reduction"); - return std::next(Instructions.begin()); - } - - iterator end() { return Instructions.end(); } - const_iterator end() const { return Instructions.end(); } - - protected: - bool Valid = false; - SmallInstructionVector Instructions; - - void add(Loop *L); - }; - - // The set of all reductions, and state tracking of possible reductions - // during loop instruction processing. 
- struct ReductionTracker { - using SmallReductionVector = SmallVector; - - // Add a new possible reduction. - void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } - - // Setup to track possible reductions corresponding to the provided - // rerolling scale. Only reductions with a number of non-PHI instructions - // that is divisible by the scale are considered. Three instructions sets - // are filled in: - // - A set of all possible instructions in eligible reductions. - // - A set of all PHIs in eligible reductions - // - A set of all reduced values (last instructions) in eligible - // reductions. - void restrictToScale(uint64_t Scale, - SmallInstructionSet &PossibleRedSet, - SmallInstructionSet &PossibleRedPHISet, - SmallInstructionSet &PossibleRedLastSet) { - PossibleRedIdx.clear(); - PossibleRedIter.clear(); - Reds.clear(); - - for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) - if (PossibleReds[i].size() % Scale == 0) { - PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); - PossibleRedPHISet.insert(PossibleReds[i].getPHI()); - - PossibleRedSet.insert(PossibleReds[i].getPHI()); - PossibleRedIdx[PossibleReds[i].getPHI()] = i; - for (Instruction *J : PossibleReds[i]) { - PossibleRedSet.insert(J); - PossibleRedIdx[J] = i; - } - } - } - - // The functions below are used while processing the loop instructions. - - // Are the two instructions both from reductions, and furthermore, from - // the same reduction? - bool isPairInSame(Instruction *J1, Instruction *J2) { - DenseMap::iterator J1I = PossibleRedIdx.find(J1); - if (J1I != PossibleRedIdx.end()) { - DenseMap::iterator J2I = PossibleRedIdx.find(J2); - if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) - return true; - } - - return false; - } - - // The two provided instructions, the first from the base iteration, and - // the second from iteration i, form a matched pair. If these are part of - // a reduction, record that fact. - void recordPair(Instruction *J1, Instruction *J2, unsigned i) { - if (PossibleRedIdx.count(J1)) { - assert(PossibleRedIdx.count(J2) && - "Recording reduction vs. non-reduction instruction?"); - - PossibleRedIter[J1] = 0; - PossibleRedIter[J2] = i; - - int Idx = PossibleRedIdx[J1]; - assert(Idx == PossibleRedIdx[J2] && - "Recording pair from different reductions?"); - Reds.insert(Idx); - } - } - - // The functions below can be called after we've finished processing all - // instructions in the loop, and we know which reductions were selected. - - bool validateSelected(); - void replaceSelected(); - - protected: - // The vector of all possible reductions (for any scale). - SmallReductionVector PossibleReds; - - DenseMap PossibleRedIdx; - DenseMap PossibleRedIter; - DenseSet Reds; - }; - - // A DAGRootSet models an induction variable being used in a rerollable - // loop. For example, - // - // x[i*3+0] = y1 - // x[i*3+1] = y2 - // x[i*3+2] = y3 - // - // Base instruction -> i*3 - // +---+----+ - // / | \ - // ST[y1] +1 +2 <-- Roots - // | | - // ST[y2] ST[y3] - // - // There may be multiple DAGRoots, for example: - // - // x[i*2+0] = ... (1) - // x[i*2+1] = ... (1) - // x[i*2+4] = ... (2) - // x[i*2+5] = ... (2) - // x[(i+1234)*2+5678] = ... (3) - // x[(i+1234)*2+5679] = ... (3) - // - // The loop will be rerolled by adding a new loop induction variable, - // one for the Base instruction in each DAGRootSet. - // - struct DAGRootSet { - Instruction *BaseInst; - SmallInstructionVector Roots; - - // The instructions between IV and BaseInst (but not including BaseInst). 
- SmallInstructionSet SubsumedInsts; - }; - - // The set of all DAG roots, and state tracking of all roots - // for a particular induction variable. - struct DAGRootTracker { - DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, - ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA, - DenseMap &IncrMap, - TinyInstructionVector LoopCtrlIVs) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), - PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap), - LoopControlIVs(LoopCtrlIVs) {} - - /// Stage 1: Find all the DAG roots for the induction variable. - bool findRoots(); - - /// Stage 2: Validate if the found roots are valid. - bool validate(ReductionTracker &Reductions); - - /// Stage 3: Assuming validate() returned true, perform the - /// replacement. - /// @param BackedgeTakenCount The backedge-taken count of L. - void replace(const SCEV *BackedgeTakenCount); - - protected: - using UsesTy = MapVector; - - void findRootsRecursive(Instruction *IVU, - SmallInstructionSet SubsumedInsts); - bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); - bool collectPossibleRoots(Instruction *Base, - std::map &Roots); - bool validateRootSet(DAGRootSet &DRS); - - bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); - void collectInLoopUserSet(const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users); - void collectInLoopUserSet(Instruction *Root, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users); - - UsesTy::iterator nextInstr(int Val, UsesTy &In, - const SmallInstructionSet &Exclude, - UsesTy::iterator *StartI=nullptr); - bool isBaseInst(Instruction *I); - bool isRootInst(Instruction *I); - bool instrDependsOn(Instruction *I, - UsesTy::iterator Start, - UsesTy::iterator End); - void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr); - - LoopReroll *Parent; - - // Members of Parent, replicated here for brevity. - Loop *L; - ScalarEvolution *SE; - AliasAnalysis *AA; - TargetLibraryInfo *TLI; - DominatorTree *DT; - LoopInfo *LI; - bool PreserveLCSSA; - - // The loop induction variable. - Instruction *IV; - - // Loop step amount. - int64_t Inc; - - // Loop reroll count; if Inc == 1, this records the scaling applied - // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; - // If Inc is not 1, Scale = Inc. - uint64_t Scale; - - // The roots themselves. - SmallVector RootSets; - - // All increment instructions for IV. - SmallInstructionVector LoopIncs; - - // Map of all instructions in the loop (in order) to the iterations - // they are used in (or specially, IL_All for instructions - // used in the loop increment mechanism). 
- UsesTy Uses; - - // Map between induction variable and its increment - DenseMap &IVToIncMap; - - TinyInstructionVector LoopControlIVs; - }; - - // Check if it is a compare-like instruction whose user is a branch - bool isCompareUsedByBranch(Instruction *I) { - auto *TI = I->getParent()->getTerminator(); - if (!isa(TI) || !isa(I)) - return false; - return I->hasOneUse() && TI->getOperand(0) == I; - }; - - bool isLoopControlIV(Loop *L, Instruction *IV); - void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); - void collectPossibleReductions(Loop *L, - ReductionTracker &Reductions); - bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, - const SCEV *BackedgeTakenCount, ReductionTracker &Reductions); - }; - -} // end anonymous namespace - -// Returns true if the provided instruction is used outside the given loop. -// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in -// non-loop blocks to be outside the loop. -static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (User *U : I->users()) { - if (!L->contains(cast(U))) - return true; - } - return false; -} - -// Check if an IV is only used to control the loop. There are two cases: -// 1. It only has one use which is loop increment, and the increment is only -// used by comparison and the PHI (could has sext with nsw in between), and the -// comparison is only used by branch. -// 2. It is used by loop increment and the comparison, the loop increment is -// only used by the PHI, and the comparison is used only by the branch. -bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { - unsigned IVUses = IV->getNumUses(); - if (IVUses != 2 && IVUses != 1) - return false; - - for (auto *User : IV->users()) { - int32_t IncOrCmpUses = User->getNumUses(); - bool IsCompInst = isCompareUsedByBranch(cast(User)); - - // User can only have one or two uses. - if (IncOrCmpUses != 2 && IncOrCmpUses != 1) - return false; - - // Case 1 - if (IVUses == 1) { - // The only user must be the loop increment. - // The loop increment must have two uses. - if (IsCompInst || IncOrCmpUses != 2) - return false; - } - - // Case 2 - if (IVUses == 2 && IncOrCmpUses != 1) - return false; - - // The users of the IV must be a binary operation or a comparison - if (auto *BO = dyn_cast(User)) { - if (BO->getOpcode() == Instruction::Add) { - // Loop Increment - // User of Loop Increment should be either PHI or CMP - for (auto *UU : User->users()) { - if (PHINode *PN = dyn_cast(UU)) { - if (PN != IV) - return false; - } - // Must be a CMP or an ext (of a value with nsw) then CMP - else { - auto *UUser = cast(UU); - // Skip SExt if we are extending an nsw value - // TODO: Allow ZExt too - if (BO->hasNoSignedWrap() && UUser->hasOneUse() && - isa(UUser)) - UUser = cast(*(UUser->user_begin())); - if (!isCompareUsedByBranch(UUser)) - return false; - } - } - } else - return false; - // Compare : can only have one use, and must be branch - } else if (!IsCompInst) - return false; - } - return true; -} - -// Collect the list of loop induction variables with respect to which it might -// be possible to reroll the loop. 
-void LoopReroll::collectPossibleIVs(Loop *L, - SmallInstructionVector &PossibleIVs) { - for (Instruction &IV : L->getHeader()->phis()) { - if (!IV.getType()->isIntegerTy() && !IV.getType()->isPointerTy()) - continue; - - if (const SCEVAddRecExpr *PHISCEV = - dyn_cast(SE->getSCEV(&IV))) { - if (PHISCEV->getLoop() != L) - continue; - if (!PHISCEV->isAffine()) - continue; - const auto *IncSCEV = dyn_cast(PHISCEV->getStepRecurrence(*SE)); - if (IncSCEV) { - IVToIncMap[&IV] = IncSCEV->getValue()->getSExtValue(); - LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << IV << " = " << *PHISCEV - << "\n"); - - if (isLoopControlIV(L, &IV)) { - LoopControlIVs.push_back(&IV); - LLVM_DEBUG(dbgs() << "LRR: Loop control only IV: " << IV - << " = " << *PHISCEV << "\n"); - } else - PossibleIVs.push_back(&IV); - } - } - } -} - -// Add the remainder of the reduction-variable chain to the instruction vector -// (the initial PHINode has already been added). If successful, the object is -// marked as valid. -void LoopReroll::SimpleLoopReduction::add(Loop *L) { - assert(!Valid && "Cannot add to an already-valid chain"); - - // The reduction variable must be a chain of single-use instructions - // (including the PHI), except for the last value (which is used by the PHI - // and also outside the loop). - Instruction *C = Instructions.front(); - if (C->user_empty()) - return; - - do { - C = cast(*C->user_begin()); - if (C->hasOneUse()) { - if (!C->isBinaryOp()) - return; - - if (!(isa(Instructions.back()) || - C->isSameOperationAs(Instructions.back()))) - return; - - Instructions.push_back(C); - } - } while (C->hasOneUse()); - - if (Instructions.size() < 2 || - !C->isSameOperationAs(Instructions.back()) || - C->use_empty()) - return; - - // C is now the (potential) last instruction in the reduction chain. - for (User *U : C->users()) { - // The only in-loop user can be the initial PHI. - if (L->contains(cast(U))) - if (cast(U) != Instructions.front()) - return; - } - - Instructions.push_back(C); - Valid = true; -} - -// Collect the vector of possible reduction variables. -void LoopReroll::collectPossibleReductions(Loop *L, - ReductionTracker &Reductions) { - BasicBlock *Header = L->getHeader(); - for (BasicBlock::iterator I = Header->begin(), - IE = Header->getFirstInsertionPt(); I != IE; ++I) { - if (!isa(I)) - continue; - if (!I->getType()->isSingleValueType()) - continue; - - SimpleLoopReduction SLR(&*I, L); - if (!SLR.valid()) - continue; - - LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " - << SLR.size() << " chained instructions)\n"); - Reductions.addSLR(SLR); - } -} - -// Collect the set of all users of the provided root instruction. This set of -// users contains not only the direct users of the root instruction, but also -// all users of those users, and so on. There are two exceptions: -// -// 1. Instructions in the set of excluded instructions are never added to the -// use set (even if they are users). This is used, for example, to exclude -// including root increments in the use set of the primary IV. -// -// 2. Instructions in the set of final instructions are added to the use set -// if they are users, but their users are not added. This is used, for -// example, to prevent a reduction update from forcing all later reduction -// updates into the use set. 
-void LoopReroll::DAGRootTracker::collectInLoopUserSet( - Instruction *Root, const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users) { - SmallInstructionVector Queue(1, Root); - while (!Queue.empty()) { - Instruction *I = Queue.pop_back_val(); - if (!Users.insert(I).second) - continue; - - if (!Final.count(I)) - for (Use &U : I->uses()) { - Instruction *User = cast(U.getUser()); - if (PHINode *PN = dyn_cast(User)) { - // Ignore "wrap-around" uses to PHIs of this loop's header. - if (PN->getIncomingBlock(U) == L->getHeader()) - continue; - } - - if (L->contains(User) && !Exclude.count(User)) { - Queue.push_back(User); - } - } - - // We also want to collect single-user "feeder" values. - for (Use &U : I->operands()) { - if (Instruction *Op = dyn_cast(U)) - if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && - !Final.count(Op)) - Queue.push_back(Op); - } - } -} - -// Collect all of the users of all of the provided root instructions (combined -// into a single set). -void LoopReroll::DAGRootTracker::collectInLoopUserSet( - const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet &Users) { - for (Instruction *Root : Roots) - collectInLoopUserSet(Root, Exclude, Final, Users); -} - -static bool isUnorderedLoadStore(Instruction *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->isUnordered(); - if (StoreInst *SI = dyn_cast(I)) - return SI->isUnordered(); - if (MemIntrinsic *MI = dyn_cast(I)) - return !MI->isVolatile(); - return false; -} - -/// Return true if IVU is a "simple" arithmetic operation. -/// This is used for narrowing the search space for DAGRoots; only arithmetic -/// and GEPs can be part of a DAGRoot. -static bool isSimpleArithmeticOp(User *IVU) { - if (Instruction *I = dyn_cast(IVU)) { - switch (I->getOpcode()) { - default: return false; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::Shl: - case Instruction::AShr: - case Instruction::LShr: - case Instruction::GetElementPtr: - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - return true; - } - } - return false; -} - -static bool isLoopIncrement(User *U, Instruction *IV) { - BinaryOperator *BO = dyn_cast(U); - - if ((BO && BO->getOpcode() != Instruction::Add) || - (!BO && !isa(U))) - return false; - - for (auto *UU : U->users()) { - PHINode *PN = dyn_cast(UU); - if (PN && PN == IV) - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker:: -collectPossibleRoots(Instruction *Base, std::map &Roots) { - SmallInstructionVector BaseUsers; - - for (auto *I : Base->users()) { - ConstantInt *CI = nullptr; - - if (isLoopIncrement(I, IV)) { - LoopIncs.push_back(cast(I)); - continue; - } - - // The root nodes must be either GEPs, ORs or ADDs. - if (auto *BO = dyn_cast(I)) { - if (BO->getOpcode() == Instruction::Add || - BO->getOpcode() == Instruction::Or) - CI = dyn_cast(BO->getOperand(1)); - } else if (auto *GEP = dyn_cast(I)) { - Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); - CI = dyn_cast(LastOperand); - } - - if (!CI) { - if (Instruction *II = dyn_cast(I)) { - BaseUsers.push_back(II); - continue; - } else { - LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I - << "\n"); - return false; - } - } - - int64_t V = std::abs(CI->getValue().getSExtValue()); - if (Roots.find(V) != Roots.end()) - // No duplicates, please. - return false; - - Roots[V] = cast(I); - } - - // Make sure we have at least two roots. 
-  if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
-    return false;
-
-  // If we found non-loop-inc, non-root users of Base, assume they are
-  // for the zeroth root index. This is because "add %a, 0" gets optimized
-  // away.
-  if (BaseUsers.size()) {
-    if (Roots.find(0) != Roots.end()) {
-      LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
-      return false;
-    }
-    Roots[0] = Base;
-  }
-
-  // Calculate the number of users of the base, or lowest indexed, iteration.
-  unsigned NumBaseUses = BaseUsers.size();
-  if (NumBaseUses == 0)
-    NumBaseUses = Roots.begin()->second->getNumUses();
-
-  // Check that every node has the same number of users.
-  for (auto &KV : Roots) {
-    if (KV.first == 0)
-      continue;
-    if (!KV.second->hasNUses(NumBaseUses)) {
-      LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
-                        << "#Base=" << NumBaseUses
-                        << ", #Root=" << KV.second->getNumUses() << "\n");
-      return false;
-    }
-  }
-
-  return true;
-}
-
-void LoopReroll::DAGRootTracker::
-findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
-  // Does the user look like it could be part of a root set?
-  // All its users must be simple arithmetic ops.
-  if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
-    return;
-
-  if (I != IV && findRootsBase(I, SubsumedInsts))
-    return;
-
-  SubsumedInsts.insert(I);
-
-  for (User *V : I->users()) {
-    Instruction *I = cast<Instruction>(V);
-    if (is_contained(LoopIncs, I))
-      continue;
-
-    if (!isSimpleArithmeticOp(I))
-      continue;
-
-    // The recursive call makes a copy of SubsumedInsts.
-    findRootsRecursive(I, SubsumedInsts);
-  }
-}
-
-bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
-  if (DRS.Roots.empty())
-    return false;
-
-  // If the value of the base instruction is used outside the loop, we cannot
-  // reroll the loop. Checking the other root instructions is unnecessary
-  // because they don't match any base instructions if their values are used
-  // outside.
-  if (hasUsesOutsideLoop(DRS.BaseInst, L))
-    return false;
-
-  // Consider a DAGRootSet with N-1 roots (so N different values including
-  // BaseInst).
-  // Define d = Roots[0] - BaseInst, which should be the same as
-  // Roots[I] - Roots[I-1] for all I in [1..N).
-  // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
-  // loop iteration J.
-  //
-  // Now, for the loop iterations to be consecutive:
-  //   D = d * N
-  const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
-  if (!ADR)
-    return false;
-
-  // Check that the first root is evenly spaced.
-  unsigned N = DRS.Roots.size() + 1;
-  const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
-  if (isa<SCEVCouldNotCompute>(StepSCEV) || StepSCEV->getType()->isPointerTy())
-    return false;
-  const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
-  if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
-    return false;
-
-  // Check that the remaining roots are evenly spaced.
-  for (unsigned i = 1; i < N - 1; ++i) {
-    const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
-                                               SE->getSCEV(DRS.Roots[i-1]));
-    if (NewStepSCEV != StepSCEV)
-      return false;
-  }
-
-  return true;
-}
-
-bool LoopReroll::DAGRootTracker::
-findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
-  // The base of a RootSet must be an AddRec, so it can be erased.
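To see the validateRootSet arithmetic on a concrete loop, take the first test removed later in this patch (@bar, which calls foo(i), foo(i+1), and foo(i+2) with i stepping by 3): the base is the add-recurrence {0,+,3}, the two roots are i+1 and i+2, so N = 2 + 1 = 3 and d = (i+1) - i = 1, and the base advances D = 3 per iteration, satisfying D = d * N = 1 * 3.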
- const auto *IVU_ADR = dyn_cast(SE->getSCEV(IVU)); - if (!IVU_ADR || IVU_ADR->getLoop() != L) - return false; - - std::map V; - if (!collectPossibleRoots(IVU, V)) - return false; - - // If we didn't get a root for index zero, then IVU must be - // subsumed. - if (V.find(0) == V.end()) - SubsumedInsts.insert(IVU); - - // Partition the vector into monotonically increasing indexes. - DAGRootSet DRS; - DRS.BaseInst = nullptr; - - SmallVector PotentialRootSets; - - for (auto &KV : V) { - if (!DRS.BaseInst) { - DRS.BaseInst = KV.second; - DRS.SubsumedInsts = SubsumedInsts; - } else if (DRS.Roots.empty()) { - DRS.Roots.push_back(KV.second); - } else if (V.find(KV.first - 1) != V.end()) { - DRS.Roots.push_back(KV.second); - } else { - // Linear sequence terminated. - if (!validateRootSet(DRS)) - return false; - - // Construct a new DAGRootSet with the next sequence. - PotentialRootSets.push_back(DRS); - DRS.BaseInst = KV.second; - DRS.Roots.clear(); - } - } - - if (!validateRootSet(DRS)) - return false; - - PotentialRootSets.push_back(DRS); - - RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end()); - - return true; -} - -bool LoopReroll::DAGRootTracker::findRoots() { - Inc = IVToIncMap[IV]; - - assert(RootSets.empty() && "Unclean state!"); - if (std::abs(Inc) == 1) { - for (auto *IVU : IV->users()) { - if (isLoopIncrement(IVU, IV)) - LoopIncs.push_back(cast(IVU)); - } - findRootsRecursive(IV, SmallInstructionSet()); - LoopIncs.push_back(IV); - } else { - if (!findRootsBase(IV, SmallInstructionSet())) - return false; - } - - // Ensure all sets have the same size. - if (RootSets.empty()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); - return false; - } - for (auto &V : RootSets) { - if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { - LLVM_DEBUG( - dbgs() - << "LRR: Aborting because not all root sets have the same size\n"); - return false; - } - } - - Scale = RootSets[0].Roots.size() + 1; - - if (Scale > IL_MaxRerollIterations) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " - << "#Found=" << Scale - << ", #Max=" << IL_MaxRerollIterations << "\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale - << "\n"); - - return true; -} - -bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { - // Populate the MapVector with all instructions in the block, in order first, - // so we can iterate over the contents later in perfect order. - for (auto &I : *L->getHeader()) { - Uses[&I].resize(IL_End); - } - - SmallInstructionSet Exclude; - for (auto &DRS : RootSets) { - Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); - Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); - Exclude.insert(DRS.BaseInst); - } - Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - - for (auto &DRS : RootSets) { - DenseSet VBase; - collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); - for (auto *I : VBase) { - Uses[I].set(0); - } - - unsigned Idx = 1; - for (auto *Root : DRS.Roots) { - DenseSet V; - collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); - - // While we're here, check the use sets are the same size. - if (V.size() != VBase.size()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); - return false; - } - - for (auto *I : V) { - Uses[I].set(Idx); - } - ++Idx; - } - - // Make sure our subsumed instructions are remembered too. 
- for (auto *I : DRS.SubsumedInsts) { - Uses[I].set(IL_All); - } - } - - // Make sure the loop increments are also accounted for. - - Exclude.clear(); - for (auto &DRS : RootSets) { - Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); - Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); - Exclude.insert(DRS.BaseInst); - } - - DenseSet V; - collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); - for (auto *I : V) { - if (I->mayHaveSideEffects()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - " - << "An instruction which does not belong to any root " - << "sets must not have side effects: " << *I); - return false; - } - Uses[I].set(IL_All); - } - - return true; -} - -/// Get the next instruction in "In" that is a member of set Val. -/// Start searching from StartI, and do not return anything in Exclude. -/// If StartI is not given, start from In.begin(). -LoopReroll::DAGRootTracker::UsesTy::iterator -LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, - const SmallInstructionSet &Exclude, - UsesTy::iterator *StartI) { - UsesTy::iterator I = StartI ? *StartI : In.begin(); - while (I != In.end() && (I->second.test(Val) == 0 || - Exclude.contains(I->first))) - ++I; - return I; -} - -bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { - for (auto &DRS : RootSets) { - if (DRS.BaseInst == I) - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { - for (auto &DRS : RootSets) { - if (is_contained(DRS.Roots, I)) - return true; - } - return false; -} - -/// Return true if instruction I depends on any instruction between -/// Start and End. -bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, - UsesTy::iterator Start, - UsesTy::iterator End) { - for (auto *U : I->users()) { - for (auto It = Start; It != End; ++It) - if (U == It->first) - return true; - } - return false; -} - -static bool isIgnorableInst(const Instruction *I) { - if (isa(I)) - return true; - const IntrinsicInst* II = dyn_cast(I); - if (!II) - return false; - switch (II->getIntrinsicID()) { - default: - return false; - case Intrinsic::annotation: - case Intrinsic::ptr_annotation: - case Intrinsic::var_annotation: - // TODO: the following intrinsics may also be allowed: - // lifetime_start, lifetime_end, invariant_start, invariant_end - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { - // We now need to check for equivalence of the use graph of each root with - // that of the primary induction variable (excluding the roots). Our goal - // here is not to solve the full graph isomorphism problem, but rather to - // catch common cases without a lot of work. As a result, we will assume - // that the relative order of the instructions in each unrolled iteration - // is the same (although we will not make an assumption about how the - // different iterations are intermixed). Note that while the order must be - // the same, the instructions may not be in the same basic block. - - // An array of just the possible reductions for this scale factor. When we - // collect the set of all users of some root instructions, these reduction - // instructions are treated as 'final' (their uses are not considered). - // This is important because we don't want the root use set to search down - // the reduction chain. 
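As an illustration of the matching strategy that follows (hypothetical IR, not from the source): with scale 3 and a body made of three isomorphic groups of the form %aK = load; %bK = add %aK, 1; store %bK for K = 0, 1, 2, iteration 1 is validated by pairing %a0 with %a1, %b0 with %b1, and the corresponding stores in order; iteration 2 then pairs the base group with %a2 and %b2 the same way.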
- SmallInstructionSet PossibleRedSet; - SmallInstructionSet PossibleRedLastSet; - SmallInstructionSet PossibleRedPHISet; - Reductions.restrictToScale(Scale, PossibleRedSet, - PossibleRedPHISet, PossibleRedLastSet); - - // Populate "Uses" with where each instruction is used. - if (!collectUsedInstructions(PossibleRedSet)) - return false; - - // Make sure we mark the reduction PHIs as used in all iterations. - for (auto *I : PossibleRedPHISet) { - Uses[I].set(IL_All); - } - - // Make sure we mark loop-control-only PHIs as used in all iterations. See - // comment above LoopReroll::isLoopControlIV for more information. - BasicBlock *Header = L->getHeader(); - for (Instruction *LoopControlIV : LoopControlIVs) { - for (auto *U : LoopControlIV->users()) { - Instruction *IVUser = dyn_cast(U); - // IVUser could be loop increment or compare - Uses[IVUser].set(IL_All); - for (auto *UU : IVUser->users()) { - Instruction *UUser = dyn_cast(UU); - // UUser could be compare, PHI or branch - Uses[UUser].set(IL_All); - // Skip SExt - if (isa(UUser)) { - UUser = dyn_cast(*(UUser->user_begin())); - Uses[UUser].set(IL_All); - } - // Is UUser a compare instruction? - if (UU->hasOneUse()) { - Instruction *BI = dyn_cast(*UUser->user_begin()); - if (BI == cast(Header->getTerminator())) - Uses[BI].set(IL_All); - } - } - } - } - - // Make sure all instructions in the loop are in one and only one - // set. - for (auto &KV : Uses) { - if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { - LLVM_DEBUG( - dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " - << *KV.first << " (#uses=" << KV.second.count() << ")\n"); - return false; - } - } - - LLVM_DEBUG(for (auto &KV - : Uses) { - dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; - }); - - BatchAAResults BatchAA(*AA); - for (unsigned Iter = 1; Iter < Scale; ++Iter) { - // In addition to regular aliasing information, we need to look for - // instructions from later (future) iterations that have side effects - // preventing us from reordering them past other instructions with side - // effects. - bool FutureSideEffects = false; - AliasSetTracker AST(BatchAA); - // The map between instructions in f(%iv.(i+1)) and f(%iv). - DenseMap BaseMap; - - // Compare iteration Iter to the base. - SmallInstructionSet Visited; - auto BaseIt = nextInstr(0, Uses, Visited); - auto RootIt = nextInstr(Iter, Uses, Visited); - auto LastRootIt = Uses.begin(); - - while (BaseIt != Uses.end() && RootIt != Uses.end()) { - Instruction *BaseInst = BaseIt->first; - Instruction *RootInst = RootIt->first; - - // Skip over the IV or root instructions; only match their users. - bool Continue = false; - if (isBaseInst(BaseInst)) { - Visited.insert(BaseInst); - BaseIt = nextInstr(0, Uses, Visited); - Continue = true; - } - if (isRootInst(RootInst)) { - LastRootIt = RootIt; - Visited.insert(RootInst); - RootIt = nextInstr(Iter, Uses, Visited); - Continue = true; - } - if (Continue) continue; - - if (!BaseInst->isSameOperationAs(RootInst)) { - // Last chance saloon. We don't try and solve the full isomorphism - // problem, but try and at least catch the case where two instructions - // *of different types* are round the wrong way. We won't be able to - // efficiently tell, given two ADD instructions, which way around we - // should match them, but given an ADD and a SUB, we can at least infer - // which one is which. - // - // This should allow us to deal with a greater subset of the isomorphism - // problem. 
It does however change a linear algorithm into a quadratic
-        // one, so limit the number of probes we do.
-        auto TryIt = RootIt;
-        unsigned N = NumToleratedFailedMatches;
-        while (TryIt != Uses.end() &&
-               !BaseInst->isSameOperationAs(TryIt->first) &&
-               N--) {
-          ++TryIt;
-          TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
-        }
-
-        if (TryIt == Uses.end() || TryIt == RootIt ||
-            instrDependsOn(TryIt->first, RootIt, TryIt)) {
-          LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
-                            << *BaseInst << " vs. " << *RootInst << "\n");
-          return false;
-        }
-
-        RootIt = TryIt;
-        RootInst = TryIt->first;
-      }
-
-      // All instructions between the last root and this root
-      // may belong to some other iteration. If they belong to a
-      // future iteration, then they're dangerous to alias with.
-      //
-      // Note that because we allow a limited amount of flexibility in the
-      // order that we visit nodes, LastRootIt might be *before* RootIt, in
-      // which case we've already checked this set of instructions so we
-      // shouldn't do anything.
-      for (; LastRootIt < RootIt; ++LastRootIt) {
-        Instruction *I = LastRootIt->first;
-        if (LastRootIt->second.find_first() < (int)Iter)
-          continue;
-        if (I->mayWriteToMemory())
-          AST.add(I);
-        // Note: This is specifically guarded by a check on isa<PHINode>,
-        // which, while a valid (somewhat arbitrary) micro-optimization, is
-        // needed because otherwise isSafeToSpeculativelyExecute returns
-        // false on PHI nodes.
-        if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
-            !isSafeToSpeculativelyExecute(I))
-          // Intervening instructions cause side effects.
-          FutureSideEffects = true;
-      }
-
-      // Make sure that this instruction, which is in the use set of this
-      // root instruction, does not also belong to the base set or the set of
-      // some other root instruction.
-      if (RootIt->second.count() > 1) {
-        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
-                          << " vs. " << *RootInst << " (prev. case overlap)\n");
-        return false;
-      }
-
-      // Make sure that we don't alias with any instruction in the alias set
-      // tracker. If we do, then we depend on a future iteration, and we
-      // can't reroll.
-      if (RootInst->mayReadFromMemory()) {
-        for (auto &K : AST) {
-          if (isModOrRefSet(K.aliasesUnknownInst(RootInst, BatchAA))) {
-            LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
-                              << *BaseInst << " vs. " << *RootInst
-                              << " (depends on future store)\n");
-            return false;
-          }
-        }
-      }
-
-      // If we've passed an instruction from a future iteration that may have
-      // side effects, and this instruction might also, then we can't reorder
-      // them, and this matching fails. As an exception, we allow the alias
-      // set tracker to handle regular (unordered) load/store dependencies.
-      if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
-                                 !isSafeToSpeculativelyExecute(BaseInst)) ||
-                                (!isUnorderedLoadStore(RootInst) &&
-                                 !isSafeToSpeculativelyExecute(RootInst)))) {
-        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
-                          << " vs. " << *RootInst
-                          << " (side effects prevent reordering)\n");
-        return false;
-      }
-
-      // For instructions that are part of a reduction, if the operation is
-      // associative, then don't bother matching the operands (because we
-      // already know that the instructions are isomorphic, and the order
-      // within the iteration does not matter). For non-associative reductions,
-      // we do need to match the operands, because we need to reject
-      // out-of-order instructions within an iteration!
-      // For example (assume floating-point addition), we need to reject this:
-      //   x += a[i];   x += b[i];
-      //   x += a[i+1]; x += b[i+1];
-      //   x += b[i+2]; x += a[i+2];
-      bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
-
-      if (!(InReduction && BaseInst->isAssociative())) {
-        bool Swapped = false, SomeOpMatched = false;
-        for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
-          Value *Op2 = RootInst->getOperand(j);
-
-          // If this is part of a reduction (and the operation is not
-          // associative), then we match all operands, but not those that are
-          // part of the reduction.
-          if (InReduction)
-            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
-              if (Reductions.isPairInSame(RootInst, Op2I))
-                continue;
-
-          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
-          if (BMI != BaseMap.end()) {
-            Op2 = BMI->second;
-          } else {
-            for (auto &DRS : RootSets) {
-              if (DRS.Roots[Iter-1] == (Instruction *)Op2) {
-                Op2 = DRS.BaseInst;
-                break;
-              }
-            }
-          }
-
-          if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
-            // If we've not already decided to swap the matched operands, and
-            // we've not already matched our first operand (note that we could
-            // have skipped matching the first operand because it is part of a
-            // reduction above), and the instruction is commutative, then try
-            // the swapped match.
-            if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
-                BaseInst->getOperand(!j) == Op2) {
-              Swapped = true;
-            } else {
-              LLVM_DEBUG(dbgs()
-                         << "LRR: iteration root match failed at " << *BaseInst
-                         << " vs. " << *RootInst << " (operand " << j << ")\n");
-              return false;
-            }
-          }
-
-          SomeOpMatched = true;
-        }
-      }
-
-      if ((!PossibleRedLastSet.count(BaseInst) &&
-           hasUsesOutsideLoop(BaseInst, L)) ||
-          (!PossibleRedLastSet.count(RootInst) &&
-           hasUsesOutsideLoop(RootInst, L))) {
-        LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
-                          << " vs. " << *RootInst << " (uses outside loop)\n");
-        return false;
-      }
-
-      Reductions.recordPair(BaseInst, RootInst, Iter);
-      BaseMap.insert(std::make_pair(RootInst, BaseInst));
-
-      LastRootIt = RootIt;
-      Visited.insert(BaseInst);
-      Visited.insert(RootInst);
-      BaseIt = nextInstr(0, Uses, Visited);
-      RootIt = nextInstr(Iter, Uses, Visited);
-    }
-    assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
-           "Mismatched set sizes!");
-  }
-
-  LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
-                    << "\n");
-
-  return true;
-}
-
-void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
-  BasicBlock *Header = L->getHeader();
-
-  // Compute the start and increment for each BaseInst before we start erasing
-  // instructions.
-  SmallVector<const SCEV *, 8> StartExprs;
-  SmallVector<const SCEV *, 8> IncrExprs;
-  for (auto &DRS : RootSets) {
-    const SCEVAddRecExpr *IVSCEV =
-        cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
-    StartExprs.push_back(IVSCEV->getStart());
-    IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
-  }
-
-  // Remove instructions associated with non-base iterations.
-  for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) {
-    unsigned I = Uses[&Inst].find_first();
-    if (I > 0 && I < IL_All) {
-      LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n");
-      Inst.eraseFromParent();
-    }
-  }
-
-  // Rewrite each BaseInst using SCEV.
-  for (size_t i = 0, e = RootSets.size(); i != e; ++i)
-    // Insert the new induction variable.
-    replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
-
-  { // Limit the lifetime of SCEVExpander.
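The arithmetic in the expander block below can be checked against the @bar test removed later in this patch: the source loop runs i = 0, 3, ..., 498, so BackedgeTakenCount is 166, and with Scale = 3 the new exit bound becomes (166 + 1) * 3 - 1 = 500, exactly the icmp eq i32 %indvar, 500 that the rerolled CHECK lines expect.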
- BranchInst *BI = cast(Header->getTerminator()); - const DataLayout &DL = Header->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "reroll"); - auto Zero = SE->getZero(BackedgeTakenCount->getType()); - auto One = SE->getOne(BackedgeTakenCount->getType()); - auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap); - Value *NewIV = - Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(), - Header->getFirstNonPHIOrDbg()); - // FIXME: This arithmetic can overflow. - auto TripCount = SE->getAddExpr(BackedgeTakenCount, One); - auto ScaledTripCount = SE->getMulExpr( - TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale)); - auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One); - Value *TakenCount = - Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(), - Header->getFirstNonPHIOrDbg()); - Value *Cond = - new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond"); - BI->setCondition(Cond); - - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); - } - - SimplifyInstructionsInBlock(Header, TLI); - DeleteDeadPHIs(Header, TLI); -} - -void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS, - const SCEV *Start, - const SCEV *IncrExpr) { - BasicBlock *Header = L->getHeader(); - Instruction *Inst = DRS.BaseInst; - - const SCEV *NewIVSCEV = - SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap); - - { // Limit the lifetime of SCEVExpander. - const DataLayout &DL = Header->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(), - Header->getFirstNonPHIOrDbg()); - - for (auto &KV : Uses) - if (KV.second.find_first() == 0) - KV.first->replaceUsesOfWith(Inst, NewIV); - } -} - -// Validate the selected reductions. All iterations must have an isomorphic -// part of the reduction chain and, for non-associative reductions, the chain -// entries must appear in order. -bool LoopReroll::ReductionTracker::validateSelected() { - // For a non-associative reduction, the chain entries must appear in order. - for (int i : Reds) { - int PrevIter = 0, BaseCount = 0, Count = 0; - for (Instruction *J : PossibleReds[i]) { - // Note that all instructions in the chain must have been found because - // all instructions in the function must have been assigned to some - // iteration. - int Iter = PossibleRedIter[J]; - if (Iter != PrevIter && Iter != PrevIter + 1 && - !PossibleReds[i].getReducedValue()->isAssociative()) { - LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " - << J << "\n"); - return false; - } - - if (Iter != PrevIter) { - if (Count != BaseCount) { - LLVM_DEBUG(dbgs() - << "LRR: Iteration " << PrevIter << " reduction use count " - << Count << " is not equal to the base use count " - << BaseCount << "\n"); - return false; - } - - Count = 0; - } - - ++Count; - if (Iter == 0) - ++BaseCount; - - PrevIter = Iter; - } - } - - return true; -} - -// For all selected reductions, remove all parts except those in the first -// iteration (and the PHI). Replace outside uses of the reduced value with uses -// of the first-iteration reduced value (in other words, reroll the selected -// reductions). -void LoopReroll::ReductionTracker::replaceSelected() { - // Fixup reductions to refer to the last instruction associated with the - // first iteration (not the last). 
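For instance (a hypothetical scale-3 reduction): with x += a[i]; x += a[i+1]; x += a[i+2], the value live out of the loop was the third add in the chain; after rerolling, only the first-iteration add survives, so users outside the loop must be redirected to it, which is what the loop below computes.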
-  for (int i : Reds) {
-    int j = 0;
-    for (int e = PossibleReds[i].size(); j != e; ++j)
-      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
-        --j;
-        break;
-      }
-
-    // Replace users with the new end-of-chain value.
-    SmallInstructionVector Users;
-    for (User *U : PossibleReds[i].getReducedValue()->users()) {
-      Users.push_back(cast<Instruction>(U));
-    }
-
-    for (Instruction *User : Users)
-      User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
-                              PossibleReds[i][j]);
-  }
-}
-
-// Reroll the provided loop with respect to the provided induction variable.
-// Generally, we're looking for a loop like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1                <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2                <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
-// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
-// be intermixed with each other. The restriction imposed by this algorithm is
-// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
-// etc. be the same.
-//
-// First, we collect the use set of %iv, excluding the other increment roots.
-// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
-// times, having collected the use set of f(%iv.(i+1)), during which we:
-//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
-//     the next unmatched instruction in f(%iv.(i+1)).
-//   - Ensure that both matched instructions don't have any external users
-//     (with the exception of last-in-chain reduction instructions).
-//   - Track the (aliasing) write set, and other side effects, of all
-//     instructions that belong to future iterations that come before the
-//     matched instructions. If the matched instructions read from that write
-//     set, then f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
-//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
-//     if any of these future instructions had side effects (could not be
-//     speculatively executed), and so do the matched instructions, then we
-//     cannot reorder those side-effect-producing instructions, and rerolling
-//     fails.
-//
-// Finally, we make sure that all loop instructions are either loop-increment
-// roots, simple latch code, parts of validated reductions, or parts of f(%iv)
-// or some f(%iv.i). If all of that is true (and all reductions have been
-// validated), then we reroll the loop.
-bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
-                        const SCEV *BackedgeTakenCount,
-                        ReductionTracker &Reductions) {
-  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
-                          IVToIncMap, LoopControlIVs);
-
-  if (!DAGRoots.findRoots())
-    return false;
-  LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
-                    << "\n");
-
-  if (!DAGRoots.validate(Reductions))
-    return false;
-  if (!Reductions.validateSelected())
-    return false;
-  // At this point, we've validated the rerolling, and we're committed to
-  // making changes!
- - Reductions.replaceSelected(); - DAGRoots.replace(BackedgeTakenCount); - - ++NumRerolledLoops; - return true; -} - -bool LoopReroll::runOnLoop(Loop *L) { - BasicBlock *Header = L->getHeader(); - LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %" - << Header->getName() << " (" << L->getNumBlocks() - << " block(s))\n"); - - // For now, we'll handle only single BB loops. - if (L->getNumBlocks() > 1) - return false; - - if (!SE->hasLoopInvariantBackedgeTakenCount(L)) - return false; - - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); - LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount - << "\n"); - - // First, we need to find the induction variable with respect to which we can - // reroll (there may be several possible options). - SmallInstructionVector PossibleIVs; - IVToIncMap.clear(); - LoopControlIVs.clear(); - collectPossibleIVs(L, PossibleIVs); - - if (PossibleIVs.empty()) { - LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n"); - return false; - } - - ReductionTracker Reductions; - collectPossibleReductions(L, Reductions); - bool Changed = false; - - // For each possible IV, collect the associated possible set of 'root' nodes - // (i+1, i+2, etc.). - for (Instruction *PossibleIV : PossibleIVs) - if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) { - Changed = true; - break; - } - LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); - - // Trip count of L has changed so SE must be re-evaluated. - if (Changed) - SE->forgetLoop(L); - - return Changed; -} - -PreservedAnalyses LoopRerollPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &U) { - return LoopReroll(&AR.AA, &AR.LI, &AR.SE, &AR.TLI, &AR.DT, true).runOnLoop(&L) - ? 
getLoopPassPreservedAnalyses() - : PreservedAnalyses::all(); -} diff --git a/llvm/test/Transforms/LoopReroll/basic.ll b/llvm/test/Transforms/LoopReroll/basic.ll deleted file mode 100644 index 92d3456..0000000 --- a/llvm/test/Transforms/LoopReroll/basic.ll +++ /dev/null @@ -1,976 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; int foo(int a); -; void bar(int *x) { -; for (int i = 0; i < 500; i += 3) { -; foo(i); -; foo(i+1); -; foo(i+2); -; } -; } - -define void @bar(ptr nocapture readnone %x) #0 { -; CHECK-LABEL: define void @bar -; CHECK-SAME: (ptr nocapture readnone [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[INDVAR]]) #[[ATTR1:[0-9]+]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i32 [[INDVAR]], 500 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %call = tail call i32 @foo(i32 %i.08) #1 - %add = add nsw i32 %i.08, 1 - %call1 = tail call i32 @foo(i32 %add) #1 - %add2 = add nsw i32 %i.08, 2 - %call3 = tail call i32 @foo(i32 %add2) #1 - %add3 = add nsw i32 %i.08, 3 - %exitcond = icmp sge i32 %add3, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -declare i32 @foo(i32) - -; void hi1(int *x) { -; for (int i = 0; i < 1500; i += 3) { -; x[i] = foo(0); -; x[i+1] = foo(0); -; x[i+2] = foo(0); -; } -; } - -; Function Attrs: nounwind uwtable -define void @hi1(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @hi1 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVAR]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %call = tail call i32 @foo(i32 0) #1 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - store i32 %call, ptr %arrayidx, align 4 - %call1 = tail call i32 @foo(i32 0) #1 - %0 = add nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call1, ptr %arrayidx3, align 4 - %call4 = tail call i32 @foo(i32 0) #1 - %1 = add nsw i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 
%call4, ptr %arrayidx7, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3 - %2 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %2, 1500 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret void -} - -; void hi2(int *x) { -; for (int i = 0; i < 500; ++i) { -; x[3*i] = foo(0); -; x[3*i+1] = foo(0); -; x[3*i+2] = foo(0); -; } -; } - -; Function Attrs: nounwind uwtable -define void @hi2(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @hi2 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %call = tail call i32 @foo(i32 0) #1 - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %call1 = tail call i32 @foo(i32 0) #1 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call1, ptr %arrayidx4, align 4 - %call5 = tail call i32 @foo(i32 0) #1 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call5, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; void goo(float alpha, float *a, float *b) { -; for (int i = 0; i < 3200; i += 5) { -; a[i] += alpha * b[i]; -; a[i + 1] += alpha * b[i + 1]; -; a[i + 2] += alpha * b[i + 2]; -; a[i + 3] += alpha * b[i + 3]; -; a[i + 4] += alpha * b[i + 4]; -; } -; } - -; Function Attrs: nounwind uwtable -define void @goo(float %alpha, ptr nocapture %a, ptr nocapture readonly %b) #0 { -; CHECK-LABEL: define void @goo -; CHECK-SAME: (float [[ALPHA:%.*]], ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[ALPHA]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[MUL]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 3199 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret 
void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %mul = fmul float %0, %alpha - %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv - %1 = load float, ptr %arrayidx2, align 4 - %add = fadd float %1, %mul - store float %add, ptr %arrayidx2, align 4 - %2 = add nsw i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds float, ptr %b, i64 %2 - %3 = load float, ptr %arrayidx5, align 4 - %mul6 = fmul float %3, %alpha - %arrayidx9 = getelementptr inbounds float, ptr %a, i64 %2 - %4 = load float, ptr %arrayidx9, align 4 - %add10 = fadd float %4, %mul6 - store float %add10, ptr %arrayidx9, align 4 - %5 = add nsw i64 %indvars.iv, 2 - %arrayidx13 = getelementptr inbounds float, ptr %b, i64 %5 - %6 = load float, ptr %arrayidx13, align 4 - %mul14 = fmul float %6, %alpha - %arrayidx17 = getelementptr inbounds float, ptr %a, i64 %5 - %7 = load float, ptr %arrayidx17, align 4 - %add18 = fadd float %7, %mul14 - store float %add18, ptr %arrayidx17, align 4 - %8 = add nsw i64 %indvars.iv, 3 - %arrayidx21 = getelementptr inbounds float, ptr %b, i64 %8 - %9 = load float, ptr %arrayidx21, align 4 - %mul22 = fmul float %9, %alpha - %arrayidx25 = getelementptr inbounds float, ptr %a, i64 %8 - %10 = load float, ptr %arrayidx25, align 4 - %add26 = fadd float %10, %mul22 - store float %add26, ptr %arrayidx25, align 4 - %11 = add nsw i64 %indvars.iv, 4 - %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %11 - %12 = load float, ptr %arrayidx29, align 4 - %mul30 = fmul float %12, %alpha - %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11 - %13 = load float, ptr %arrayidx33, align 4 - %add34 = fadd float %13, %mul30 - store float %add34, ptr %arrayidx33, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 - %14 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %14, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret void -} - -; void hoo(float alpha, float *a, float *b, int *ip) { -; for (int i = 0; i < 3200; i += 5) { -; a[i] += alpha * b[ip[i]]; -; a[i + 1] += alpha * b[ip[i + 1]]; -; a[i + 2] += alpha * b[ip[i + 2]]; -; a[i + 3] += alpha * b[ip[i + 3]]; -; a[i + 4] += alpha * b[ip[i + 4]]; -; } -; } - -; Function Attrs: nounwind uwtable -define void @hoo(float %alpha, ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %ip) #0 { -; CHECK-LABEL: define void @hoo -; CHECK-SAME: (float [[ALPHA:%.*]], ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[IP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[IP]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP2]], [[ALPHA]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVAR]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr 
[[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[MUL]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 3199 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %ip, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %idxprom1 = sext i32 %0 to i64 - %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %idxprom1 - %1 = load float, ptr %arrayidx2, align 4 - %mul = fmul float %1, %alpha - %arrayidx4 = getelementptr inbounds float, ptr %a, i64 %indvars.iv - %2 = load float, ptr %arrayidx4, align 4 - %add = fadd float %2, %mul - store float %add, ptr %arrayidx4, align 4 - %3 = add nsw i64 %indvars.iv, 1 - %arrayidx7 = getelementptr inbounds i32, ptr %ip, i64 %3 - %4 = load i32, ptr %arrayidx7, align 4 - %idxprom8 = sext i32 %4 to i64 - %arrayidx9 = getelementptr inbounds float, ptr %b, i64 %idxprom8 - %5 = load float, ptr %arrayidx9, align 4 - %mul10 = fmul float %5, %alpha - %arrayidx13 = getelementptr inbounds float, ptr %a, i64 %3 - %6 = load float, ptr %arrayidx13, align 4 - %add14 = fadd float %6, %mul10 - store float %add14, ptr %arrayidx13, align 4 - %7 = add nsw i64 %indvars.iv, 2 - %arrayidx17 = getelementptr inbounds i32, ptr %ip, i64 %7 - %8 = load i32, ptr %arrayidx17, align 4 - %idxprom18 = sext i32 %8 to i64 - %arrayidx19 = getelementptr inbounds float, ptr %b, i64 %idxprom18 - %9 = load float, ptr %arrayidx19, align 4 - %mul20 = fmul float %9, %alpha - %arrayidx23 = getelementptr inbounds float, ptr %a, i64 %7 - %10 = load float, ptr %arrayidx23, align 4 - %add24 = fadd float %10, %mul20 - store float %add24, ptr %arrayidx23, align 4 - %11 = add nsw i64 %indvars.iv, 3 - %arrayidx27 = getelementptr inbounds i32, ptr %ip, i64 %11 - %12 = load i32, ptr %arrayidx27, align 4 - %idxprom28 = sext i32 %12 to i64 - %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %idxprom28 - %13 = load float, ptr %arrayidx29, align 4 - %mul30 = fmul float %13, %alpha - %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11 - %14 = load float, ptr %arrayidx33, align 4 - %add34 = fadd float %14, %mul30 - store float %add34, ptr %arrayidx33, align 4 - %15 = add nsw i64 %indvars.iv, 4 - %arrayidx37 = getelementptr inbounds i32, ptr %ip, i64 %15 - %16 = load i32, ptr %arrayidx37, align 4 - %idxprom38 = sext i32 %16 to i64 - %arrayidx39 = getelementptr inbounds float, ptr %b, i64 %idxprom38 - %17 = load float, ptr %arrayidx39, align 4 - %mul40 = fmul float %17, %alpha - %arrayidx43 = getelementptr inbounds float, ptr %a, i64 %15 - %18 = load float, ptr %arrayidx43, align 4 - %add44 = fadd float %18, %mul40 - store float %add44, ptr %arrayidx43, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 - %19 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %19, 3200 - br i1 %cmp, label %for.body, label %for.end - - - - -for.end: ; preds = %for.body - ret void -} - -; void multi1(int *x) { -; y = foo(0) -; for (int i = 0; i < 500; ++i) { -; x[3*i] = y; -; x[3*i+1] = y; -; x[3*i+2] = y; -; x[3*i+6] = y; -; x[3*i+7] = y; -; x[3*i+8] = y; -; } -; } - -; Function Attrs: nounwind uwtable -define void @multi1(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @multi1 -; CHECK-SAME: 
(ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 6 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call, ptr %arrayidx9, align 4 - %3 = add nsw i64 %0, 6 - %arrayidx6 = getelementptr inbounds i32, ptr %x, i64 %3 - store i32 %call, ptr %arrayidx6, align 4 - %4 = add nsw i64 %0, 7 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %4 - store i32 %call, ptr %arrayidx7, align 4 - %5 = add nsw i64 %0, 8 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %5 - store i32 %call, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - - - -for.end: ; preds = %for.body - ret void -} - -; void multi2(int *x) { -; y = foo(0) -; for (int i = 0; i < 500; ++i) { -; x[3*i] = y; -; x[3*i+1] = y; -; x[3*i+2] = y; -; x[3*(i+1)] = y; -; x[3*(i+1)+1] = y; -; x[3*(i+1)+2] = y; -; } -; } - -; Function Attrs: nounwind uwtable -define void @multi2(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @multi2 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %add = add nsw i64 
%indvars.iv, 1 - %newmul = mul nsw i64 %add, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call, ptr %arrayidx9, align 4 - %arrayidx6 = getelementptr inbounds i32, ptr %x, i64 %newmul - store i32 %call, ptr %arrayidx6, align 4 - %3 = add nsw i64 %newmul, 1 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3 - store i32 %call, ptr %arrayidx7, align 4 - %4 = add nsw i64 %newmul, 2 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %4 - store i32 %call, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - - - -for.end: ; preds = %for.body - ret void -} - -; void multi3(int *x) { -; y = foo(0) -; for (int i = 0; i < 500; ++i) { -; // Note: No zero index -; x[3*i+3] = y; -; x[3*i+4] = y; -; x[3*i+5] = y; -; } -; } - -; Function Attrs: nounwind uwtable -define void @multi3(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @multi3 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %x0 = add nsw i64 %0, 3 - %add = add nsw i64 %indvars.iv, 1 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %x0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 4 - %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 5 - %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2 - store i32 %call, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - - -for.end: ; preds = %for.body - ret void -} - -; int foo(int a); -; void bar2(int *x, int y, int z) { -; for (int i = 0; i < 500; i += 3) { -; foo(i+y+i*z); // Slightly reordered instruction order -; foo(i+1+y+(i+1)*z); -; foo(i+2+y+(i+2)*z); -; } -; } - -; Function Attrs: nounwind uwtable -define void @bar2(ptr nocapture readnone %x, i32 %y, i32 %z) #0 { -; CHECK-LABEL: define void @bar2 -; CHECK-SAME: (ptr nocapture readnone [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDVAR]], [[Y]] -; CHECK-NEXT: 
[[TMP2:%.*]] = mul i32 [[INDVAR]], [[Z]] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[TMP3]]) #[[ATTR1]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i32 [[INDVAR]], 500 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - - %tmp1 = add i32 %i.08, %y - %tmp2 = mul i32 %i.08, %z - %tmp3 = add i32 %tmp2, %tmp1 - %call = tail call i32 @foo(i32 %tmp3) #1 - - %add = add nsw i32 %i.08, 1 - %tmp2a = mul i32 %add, %z - %tmp1a = add i32 %add, %y - %tmp3a = add i32 %tmp2a, %tmp1a - %calla = tail call i32 @foo(i32 %tmp3a) #1 - - %add2 = add nsw i32 %i.08, 2 - %tmp2b = mul i32 %add2, %z - %tmp1b = add i32 %add2, %y - %tmp3b = add i32 %tmp2b, %tmp1b - %callb = tail call i32 @foo(i32 %tmp3b) #1 - - %add3 = add nsw i32 %i.08, 3 - - %exitcond = icmp sge i32 %add3, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -%struct.s = type { i32, i32 } - -; Function Attrs: nounwind uwtable -define void @gep1(ptr nocapture %x) #0 { -; CHECK-LABEL: define void @gep1 -; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[X]], i64 [[TMP0]], i32 0 -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[X]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[X]], i64 [[TMP2]], i32 0 -; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 500 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds %struct.s, ptr %x, i64 %0, i32 0 - store i32 %call, ptr %arrayidx, align 4 - %1 = add nsw i64 %0, 1 - %arrayidx4 = getelementptr inbounds %struct.s, ptr %x, i64 %1, i32 0 - store i32 %call, ptr %arrayidx4, align 4 - %2 = add nsw i64 %0, 2 - %arrayidx9 = getelementptr inbounds %struct.s, ptr %x, i64 %2, i32 0 - store i32 %call, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - -; This test is a crash test only. 
-for.end: ; preds = %for.body - ret void -} - -define void @gep-indexing(ptr nocapture %x) { -; CHECK-LABEL: define void @gep-indexing -; CHECK-SAME: (ptr nocapture [[X:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[X]], i64 [[TMP0]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %call = tail call i32 @foo(i32 0) #1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %0 = mul nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0 - store i32 %call, ptr %arrayidx, align 4 - %arrayidx4 = getelementptr inbounds i32, ptr %arrayidx, i64 1 - store i32 %call, ptr %arrayidx4, align 4 - %arrayidx9 = getelementptr inbounds i32, ptr %arrayidx, i64 2 - store i32 %call, ptr %arrayidx9, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 500 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - - -define void @unordered_atomic_ops(ptr noalias %buf_0, ptr noalias %buf_1) { -; CHECK-LABEL: define void @unordered_atomic_ops -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVAR]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVAR]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4 -; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] unordered, align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], 3199 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a unordered, align 4 - %vb = load atomic i32, ptr %buf0_b unordered, align 4 - store atomic i32 %va, ptr %buf1_a unordered, align 4 - store atomic i32 %vb, ptr %buf1_b unordered, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @unordered_atomic_ops_nomatch(ptr noalias %buf_0, ptr noalias %buf_1) { -; Negative test -; CHECK-LABEL: define void @unordered_atomic_ops_nomatch -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias 
[[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4 -; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] unordered, align 4 -; CHECK-NEXT: store i32 [[VA]], ptr [[BUF1_A]], align 4 -; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] unordered, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a unordered, align 4 - %vb = load atomic i32, ptr %buf0_b unordered, align 4 - store i32 %va, ptr %buf1_a, align 4 ;; Not atomic - store atomic i32 %vb, ptr %buf1_b unordered, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @ordered_atomic_ops(ptr noalias %buf_0, ptr noalias %buf_1) { -; Negative test -; CHECK-LABEL: define void @ordered_atomic_ops -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] acquire, align 4 -; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] acquire, align 4 -; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] release, align 4 -; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] release, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = 
getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a acquire, align 4 - %vb = load atomic i32, ptr %buf0_b acquire, align 4 - store atomic i32 %va, ptr %buf1_a release, align 4 - store atomic i32 %vb, ptr %buf1_b release, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @unordered_atomic_ops_with_fence(ptr noalias %buf_0, ptr noalias %buf_1) { -; CHECK-LABEL: define void @unordered_atomic_ops_with_fence -; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]] -; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4 -; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] unordered, align 4 -; CHECK-NEXT: fence seq_cst -; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] unordered, align 4 -; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] unordered, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - - %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add i32 %indvars.iv, 2 - %indvars.mid = add i32 %indvars.iv, 1 - %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv - %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid - %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv - %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid - %va = load atomic i32, ptr %buf0_a unordered, align 4 - %vb = load atomic i32, ptr %buf0_b unordered, align 4 - fence seq_cst - store atomic i32 %va, ptr %buf1_a unordered, align 4 - store atomic i32 %vb, ptr %buf1_b unordered, align 4 - %cmp = icmp slt i32 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret void -} - -define void @pointer_bitcast_baseinst(ptr %arg, ptr %arg1, i64 %arg2) { -; CHECK-LABEL: define void @pointer_bitcast_baseinst -; CHECK-SAME: (ptr [[ARG:%.*]], ptr [[ARG1:%.*]], i64 [[ARG2:%.*]]) { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG2]], -17 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: br label [[BB3:%.*]] -; CHECK: bb3: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[INDVAR]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1 -; CHECK-NEXT: [[INST5:%.*]] = shl nuw i64 [[TMP5]], 1 -; CHECK-NEXT: [[INST6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INST5]] -; CHECK-NEXT: [[INST8:%.*]] = load <8 x 
i16>, ptr [[INST6]], align 2 -; CHECK-NEXT: [[INST13:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[TMP5]] -; CHECK-NEXT: store <8 x i16> [[INST8]], ptr [[INST13]], align 2 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP3]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19:%.*]], label [[BB3]] -; CHECK: bb19: -; CHECK-NEXT: ret void -; -bb: - br label %bb3 - -bb3: ; preds = %bb3, %bb - %inst = phi i64 [ 1, %bb ], [ %inst17, %bb3 ] - %inst4 = add nuw i64 %inst, 8 - %inst5 = shl nuw i64 %inst, 1 - %inst6 = getelementptr i8, ptr %arg1, i64 %inst5 - %inst8 = load <8 x i16>, ptr %inst6, align 2 - %inst9 = shl i64 %inst4, 1 - %inst10 = getelementptr i8, ptr %arg1, i64 %inst9 - %inst12 = load <8 x i16>, ptr %inst10, align 2 - %inst13 = getelementptr i16, ptr %arg, i64 %inst - store <8 x i16> %inst8, ptr %inst13, align 2 - %inst15 = getelementptr i16, ptr %arg, i64 %inst4 - store <8 x i16> %inst12, ptr %inst15, align 2 - %inst17 = add nuw nsw i64 %inst, 16 - %inst18 = icmp eq i64 %inst17, %arg2 - br i1 %inst18, label %bb19, label %bb3 - -bb19: ; preds = %bb3 - ret void -} - -define void @bad_step(ptr nocapture readnone %x) #0 { -; CHECK-LABEL: define void @bad_step -; CHECK-SAME: (ptr nocapture readnone [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[I_08]]) #[[ATTR1]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I_08]], 2 -; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 @foo(i32 [[ADD]]) #[[ATTR1]] -; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[I_08]], 3 -; CHECK-NEXT: [[CALL3:%.*]] = tail call i32 @foo(i32 [[ADD2]]) #[[ATTR1]] -; CHECK-NEXT: [[ADD3]] = add nsw i32 [[I_08]], 6 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sge i32 [[ADD3]], 500 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %call = tail call i32 @foo(i32 %i.08) #1 - %add = add nsw i32 %i.08, 2 - %call1 = tail call i32 @foo(i32 %add) #1 - %add2 = add nsw i32 %i.08, 3 - %call3 = tail call i32 @foo(i32 %add2) #1 - %add3 = add nsw i32 %i.08, 6 - %exitcond = icmp sge i32 %add3, 500 - br i1 %exitcond, label %for.end, label %for.body - - -for.end: ; preds = %for.body - ret void -} - -@a = external global [2 x [512 x i64]], align 16 -@b = external global [512 x [4 x i64]], align 16 - -define void @ptr_step_crash() { -; CHECK-LABEL: define void @ptr_step_crash() { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY42_3:%.*]] -; CHECK: for.body42.3: -; CHECK-NEXT: [[K_2207_3:%.*]] = phi i32 [ -512, [[ENTRY:%.*]] ], [ [[INC63_3:%.*]], [[FOR_BODY42_3]] ] -; CHECK-NEXT: [[SUB46_3:%.*]] = add nsw i32 [[K_2207_3]], 512 -; CHECK-NEXT: [[IDXPROM47_3:%.*]] = zext i32 [[SUB46_3]] to i64 -; CHECK-NEXT: [[ARRAYIDX48_3:%.*]] = getelementptr inbounds [2 x [512 x i64]], ptr @a, i64 0, i64 0, i64 [[IDXPROM47_3]] -; CHECK-NEXT: [[ARRAYIDX55_3:%.*]] = getelementptr inbounds [512 x [4 x i64]], ptr @b, i64 0, i64 [[IDXPROM47_3]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX55_3]], align 8 -; CHECK-NEXT: [[INC63_3]] = add nsw i32 [[K_2207_3]], 1 -; CHECK-NEXT: br i1 true, label [[FOR_INC65_3:%.*]], label [[FOR_BODY42_3]] -; CHECK: for.inc65.3: -; CHECK-NEXT: ret void -; -entry: - br label 
%for.body42.3 - -for.body42.3: ; preds = %for.body42.3, %entry - %k.2207.3 = phi i32 [ -512, %entry ], [ %inc63.3, %for.body42.3 ] - %sub46.3 = add nsw i32 %k.2207.3, 512 - %idxprom47.3 = zext i32 %sub46.3 to i64 - %arrayidx48.3 = getelementptr inbounds [2 x [512 x i64]], ptr @a, i64 0, i64 0, i64 %idxprom47.3 - %arrayidx55.3 = getelementptr inbounds [512 x [4 x i64]], ptr @b, i64 0, i64 %idxprom47.3, i64 3 - %0 = load i64, ptr %arrayidx55.3, align 8 - %inc63.3 = add nsw i32 %k.2207.3, 1 - br i1 undef, label %for.inc65.3, label %for.body42.3 - -for.inc65.3: ; preds = %for.body42.3 - ret void -} - -attributes #0 = { nounwind uwtable } -attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/LoopReroll/basic32iters.ll b/llvm/test/Transforms/LoopReroll/basic32iters.ll deleted file mode 100644 index edf38cb..0000000 --- a/llvm/test/Transforms/LoopReroll/basic32iters.ll +++ /dev/null @@ -1,328 +0,0 @@ -; RUN: opt < %s -passes=loop-reroll -verify-scev -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; void goo32(float alpha, float *a, float *b) { -; for (int i = 0; i < 3200; i += 32) { -; a[i] += alpha * b[i]; -; a[i + 1] += alpha * b[i + 1]; -; a[i + 2] += alpha * b[i + 2]; -; a[i + 3] += alpha * b[i + 3]; -; a[i + 4] += alpha * b[i + 4]; -; a[i + 5] += alpha * b[i + 5]; -; a[i + 6] += alpha * b[i + 6]; -; a[i + 7] += alpha * b[i + 7]; -; a[i + 8] += alpha * b[i + 8]; -; a[i + 9] += alpha * b[i + 9]; -; a[i + 10] += alpha * b[i + 10]; -; a[i + 11] += alpha * b[i + 11]; -; a[i + 12] += alpha * b[i + 12]; -; a[i + 13] += alpha * b[i + 13]; -; a[i + 14] += alpha * b[i + 14]; -; a[i + 15] += alpha * b[i + 15]; -; a[i + 16] += alpha * b[i + 16]; -; a[i + 17] += alpha * b[i + 17]; -; a[i + 18] += alpha * b[i + 18]; -; a[i + 19] += alpha * b[i + 19]; -; a[i + 20] += alpha * b[i + 20]; -; a[i + 21] += alpha * b[i + 21]; -; a[i + 22] += alpha * b[i + 22]; -; a[i + 23] += alpha * b[i + 23]; -; a[i + 24] += alpha * b[i + 24]; -; a[i + 25] += alpha * b[i + 25]; -; a[i + 26] += alpha * b[i + 26]; -; a[i + 27] += alpha * b[i + 27]; -; a[i + 28] += alpha * b[i + 28]; -; a[i + 29] += alpha * b[i + 29]; -; a[i + 30] += alpha * b[i + 30]; -; a[i + 31] += alpha * b[i + 31]; -; } -; } - -; Function Attrs: norecurse nounwind uwtable -define void @goo32(float %alpha, ptr %a, ptr readonly %b) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %mul = fmul float %0, %alpha - %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv - %1 = load float, ptr %arrayidx2, align 4 - %add = fadd float %1, %mul - store float %add, ptr %arrayidx2, align 4 - %2 = or disjoint i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds float, ptr %b, i64 %2 - %3 = load float, ptr %arrayidx5, align 4 - %mul6 = fmul float %3, %alpha - %arrayidx9 = getelementptr inbounds float, ptr %a, i64 %2 - %4 = load float, ptr %arrayidx9, align 4 - %add10 = fadd float %4, %mul6 - store float %add10, ptr %arrayidx9, align 4 - %5 = or disjoint i64 %indvars.iv, 2 - %arrayidx13 = getelementptr inbounds float, ptr %b, i64 %5 - %6 = load float, ptr %arrayidx13, align 4 - %mul14 = fmul float %6, %alpha - %arrayidx17 = getelementptr inbounds float, ptr %a, 
i64 %5 - %7 = load float, ptr %arrayidx17, align 4 - %add18 = fadd float %7, %mul14 - store float %add18, ptr %arrayidx17, align 4 - %8 = or disjoint i64 %indvars.iv, 3 - %arrayidx21 = getelementptr inbounds float, ptr %b, i64 %8 - %9 = load float, ptr %arrayidx21, align 4 - %mul22 = fmul float %9, %alpha - %arrayidx25 = getelementptr inbounds float, ptr %a, i64 %8 - %10 = load float, ptr %arrayidx25, align 4 - %add26 = fadd float %10, %mul22 - store float %add26, ptr %arrayidx25, align 4 - %11 = or disjoint i64 %indvars.iv, 4 - %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %11 - %12 = load float, ptr %arrayidx29, align 4 - %mul30 = fmul float %12, %alpha - %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11 - %13 = load float, ptr %arrayidx33, align 4 - %add34 = fadd float %13, %mul30 - store float %add34, ptr %arrayidx33, align 4 - %14 = or disjoint i64 %indvars.iv, 5 - %arrayidx37 = getelementptr inbounds float, ptr %b, i64 %14 - %15 = load float, ptr %arrayidx37, align 4 - %mul38 = fmul float %15, %alpha - %arrayidx41 = getelementptr inbounds float, ptr %a, i64 %14 - %16 = load float, ptr %arrayidx41, align 4 - %add42 = fadd float %16, %mul38 - store float %add42, ptr %arrayidx41, align 4 - %17 = or disjoint i64 %indvars.iv, 6 - %arrayidx45 = getelementptr inbounds float, ptr %b, i64 %17 - %18 = load float, ptr %arrayidx45, align 4 - %mul46 = fmul float %18, %alpha - %arrayidx49 = getelementptr inbounds float, ptr %a, i64 %17 - %19 = load float, ptr %arrayidx49, align 4 - %add50 = fadd float %19, %mul46 - store float %add50, ptr %arrayidx49, align 4 - %20 = or disjoint i64 %indvars.iv, 7 - %arrayidx53 = getelementptr inbounds float, ptr %b, i64 %20 - %21 = load float, ptr %arrayidx53, align 4 - %mul54 = fmul float %21, %alpha - %arrayidx57 = getelementptr inbounds float, ptr %a, i64 %20 - %22 = load float, ptr %arrayidx57, align 4 - %add58 = fadd float %22, %mul54 - store float %add58, ptr %arrayidx57, align 4 - %23 = or disjoint i64 %indvars.iv, 8 - %arrayidx61 = getelementptr inbounds float, ptr %b, i64 %23 - %24 = load float, ptr %arrayidx61, align 4 - %mul62 = fmul float %24, %alpha - %arrayidx65 = getelementptr inbounds float, ptr %a, i64 %23 - %25 = load float, ptr %arrayidx65, align 4 - %add66 = fadd float %25, %mul62 - store float %add66, ptr %arrayidx65, align 4 - %26 = or disjoint i64 %indvars.iv, 9 - %arrayidx69 = getelementptr inbounds float, ptr %b, i64 %26 - %27 = load float, ptr %arrayidx69, align 4 - %mul70 = fmul float %27, %alpha - %arrayidx73 = getelementptr inbounds float, ptr %a, i64 %26 - %28 = load float, ptr %arrayidx73, align 4 - %add74 = fadd float %28, %mul70 - store float %add74, ptr %arrayidx73, align 4 - %29 = or disjoint i64 %indvars.iv, 10 - %arrayidx77 = getelementptr inbounds float, ptr %b, i64 %29 - %30 = load float, ptr %arrayidx77, align 4 - %mul78 = fmul float %30, %alpha - %arrayidx81 = getelementptr inbounds float, ptr %a, i64 %29 - %31 = load float, ptr %arrayidx81, align 4 - %add82 = fadd float %31, %mul78 - store float %add82, ptr %arrayidx81, align 4 - %32 = or disjoint i64 %indvars.iv, 11 - %arrayidx85 = getelementptr inbounds float, ptr %b, i64 %32 - %33 = load float, ptr %arrayidx85, align 4 - %mul86 = fmul float %33, %alpha - %arrayidx89 = getelementptr inbounds float, ptr %a, i64 %32 - %34 = load float, ptr %arrayidx89, align 4 - %add90 = fadd float %34, %mul86 - store float %add90, ptr %arrayidx89, align 4 - %35 = or disjoint i64 %indvars.iv, 12 - %arrayidx93 = getelementptr inbounds float, ptr %b, i64 %35 - %36 = load 
float, ptr %arrayidx93, align 4 - %mul94 = fmul float %36, %alpha - %arrayidx97 = getelementptr inbounds float, ptr %a, i64 %35 - %37 = load float, ptr %arrayidx97, align 4 - %add98 = fadd float %37, %mul94 - store float %add98, ptr %arrayidx97, align 4 - %38 = or disjoint i64 %indvars.iv, 13 - %arrayidx101 = getelementptr inbounds float, ptr %b, i64 %38 - %39 = load float, ptr %arrayidx101, align 4 - %mul102 = fmul float %39, %alpha - %arrayidx105 = getelementptr inbounds float, ptr %a, i64 %38 - %40 = load float, ptr %arrayidx105, align 4 - %add106 = fadd float %40, %mul102 - store float %add106, ptr %arrayidx105, align 4 - %41 = or disjoint i64 %indvars.iv, 14 - %arrayidx109 = getelementptr inbounds float, ptr %b, i64 %41 - %42 = load float, ptr %arrayidx109, align 4 - %mul110 = fmul float %42, %alpha - %arrayidx113 = getelementptr inbounds float, ptr %a, i64 %41 - %43 = load float, ptr %arrayidx113, align 4 - %add114 = fadd float %43, %mul110 - store float %add114, ptr %arrayidx113, align 4 - %44 = or disjoint i64 %indvars.iv, 15 - %arrayidx117 = getelementptr inbounds float, ptr %b, i64 %44 - %45 = load float, ptr %arrayidx117, align 4 - %mul118 = fmul float %45, %alpha - %arrayidx121 = getelementptr inbounds float, ptr %a, i64 %44 - %46 = load float, ptr %arrayidx121, align 4 - %add122 = fadd float %46, %mul118 - store float %add122, ptr %arrayidx121, align 4 - %47 = or disjoint i64 %indvars.iv, 16 - %arrayidx125 = getelementptr inbounds float, ptr %b, i64 %47 - %48 = load float, ptr %arrayidx125, align 4 - %mul126 = fmul float %48, %alpha - %arrayidx129 = getelementptr inbounds float, ptr %a, i64 %47 - %49 = load float, ptr %arrayidx129, align 4 - %add130 = fadd float %49, %mul126 - store float %add130, ptr %arrayidx129, align 4 - %50 = or disjoint i64 %indvars.iv, 17 - %arrayidx133 = getelementptr inbounds float, ptr %b, i64 %50 - %51 = load float, ptr %arrayidx133, align 4 - %mul134 = fmul float %51, %alpha - %arrayidx137 = getelementptr inbounds float, ptr %a, i64 %50 - %52 = load float, ptr %arrayidx137, align 4 - %add138 = fadd float %52, %mul134 - store float %add138, ptr %arrayidx137, align 4 - %53 = or disjoint i64 %indvars.iv, 18 - %arrayidx141 = getelementptr inbounds float, ptr %b, i64 %53 - %54 = load float, ptr %arrayidx141, align 4 - %mul142 = fmul float %54, %alpha - %arrayidx145 = getelementptr inbounds float, ptr %a, i64 %53 - %55 = load float, ptr %arrayidx145, align 4 - %add146 = fadd float %55, %mul142 - store float %add146, ptr %arrayidx145, align 4 - %56 = or disjoint i64 %indvars.iv, 19 - %arrayidx149 = getelementptr inbounds float, ptr %b, i64 %56 - %57 = load float, ptr %arrayidx149, align 4 - %mul150 = fmul float %57, %alpha - %arrayidx153 = getelementptr inbounds float, ptr %a, i64 %56 - %58 = load float, ptr %arrayidx153, align 4 - %add154 = fadd float %58, %mul150 - store float %add154, ptr %arrayidx153, align 4 - %59 = or disjoint i64 %indvars.iv, 20 - %arrayidx157 = getelementptr inbounds float, ptr %b, i64 %59 - %60 = load float, ptr %arrayidx157, align 4 - %mul158 = fmul float %60, %alpha - %arrayidx161 = getelementptr inbounds float, ptr %a, i64 %59 - %61 = load float, ptr %arrayidx161, align 4 - %add162 = fadd float %61, %mul158 - store float %add162, ptr %arrayidx161, align 4 - %62 = or disjoint i64 %indvars.iv, 21 - %arrayidx165 = getelementptr inbounds float, ptr %b, i64 %62 - %63 = load float, ptr %arrayidx165, align 4 - %mul166 = fmul float %63, %alpha - %arrayidx169 = getelementptr inbounds float, ptr %a, i64 %62 - %64 = load float, ptr 
%arrayidx169, align 4 - %add170 = fadd float %64, %mul166 - store float %add170, ptr %arrayidx169, align 4 - %65 = or disjoint i64 %indvars.iv, 22 - %arrayidx173 = getelementptr inbounds float, ptr %b, i64 %65 - %66 = load float, ptr %arrayidx173, align 4 - %mul174 = fmul float %66, %alpha - %arrayidx177 = getelementptr inbounds float, ptr %a, i64 %65 - %67 = load float, ptr %arrayidx177, align 4 - %add178 = fadd float %67, %mul174 - store float %add178, ptr %arrayidx177, align 4 - %68 = or disjoint i64 %indvars.iv, 23 - %arrayidx181 = getelementptr inbounds float, ptr %b, i64 %68 - %69 = load float, ptr %arrayidx181, align 4 - %mul182 = fmul float %69, %alpha - %arrayidx185 = getelementptr inbounds float, ptr %a, i64 %68 - %70 = load float, ptr %arrayidx185, align 4 - %add186 = fadd float %70, %mul182 - store float %add186, ptr %arrayidx185, align 4 - %71 = or disjoint i64 %indvars.iv, 24 - %arrayidx189 = getelementptr inbounds float, ptr %b, i64 %71 - %72 = load float, ptr %arrayidx189, align 4 - %mul190 = fmul float %72, %alpha - %arrayidx193 = getelementptr inbounds float, ptr %a, i64 %71 - %73 = load float, ptr %arrayidx193, align 4 - %add194 = fadd float %73, %mul190 - store float %add194, ptr %arrayidx193, align 4 - %74 = or disjoint i64 %indvars.iv, 25 - %arrayidx197 = getelementptr inbounds float, ptr %b, i64 %74 - %75 = load float, ptr %arrayidx197, align 4 - %mul198 = fmul float %75, %alpha - %arrayidx201 = getelementptr inbounds float, ptr %a, i64 %74 - %76 = load float, ptr %arrayidx201, align 4 - %add202 = fadd float %76, %mul198 - store float %add202, ptr %arrayidx201, align 4 - %77 = or disjoint i64 %indvars.iv, 26 - %arrayidx205 = getelementptr inbounds float, ptr %b, i64 %77 - %78 = load float, ptr %arrayidx205, align 4 - %mul206 = fmul float %78, %alpha - %arrayidx209 = getelementptr inbounds float, ptr %a, i64 %77 - %79 = load float, ptr %arrayidx209, align 4 - %add210 = fadd float %79, %mul206 - store float %add210, ptr %arrayidx209, align 4 - %80 = or disjoint i64 %indvars.iv, 27 - %arrayidx213 = getelementptr inbounds float, ptr %b, i64 %80 - %81 = load float, ptr %arrayidx213, align 4 - %mul214 = fmul float %81, %alpha - %arrayidx217 = getelementptr inbounds float, ptr %a, i64 %80 - %82 = load float, ptr %arrayidx217, align 4 - %add218 = fadd float %82, %mul214 - store float %add218, ptr %arrayidx217, align 4 - %83 = or disjoint i64 %indvars.iv, 28 - %arrayidx221 = getelementptr inbounds float, ptr %b, i64 %83 - %84 = load float, ptr %arrayidx221, align 4 - %mul222 = fmul float %84, %alpha - %arrayidx225 = getelementptr inbounds float, ptr %a, i64 %83 - %85 = load float, ptr %arrayidx225, align 4 - %add226 = fadd float %85, %mul222 - store float %add226, ptr %arrayidx225, align 4 - %86 = or disjoint i64 %indvars.iv, 29 - %arrayidx229 = getelementptr inbounds float, ptr %b, i64 %86 - %87 = load float, ptr %arrayidx229, align 4 - %mul230 = fmul float %87, %alpha - %arrayidx233 = getelementptr inbounds float, ptr %a, i64 %86 - %88 = load float, ptr %arrayidx233, align 4 - %add234 = fadd float %88, %mul230 - store float %add234, ptr %arrayidx233, align 4 - %89 = or disjoint i64 %indvars.iv, 30 - %arrayidx237 = getelementptr inbounds float, ptr %b, i64 %89 - %90 = load float, ptr %arrayidx237, align 4 - %mul238 = fmul float %90, %alpha - %arrayidx241 = getelementptr inbounds float, ptr %a, i64 %89 - %91 = load float, ptr %arrayidx241, align 4 - %add242 = fadd float %91, %mul238 - store float %add242, ptr %arrayidx241, align 4 - %92 = or disjoint i64 %indvars.iv, 31 - 
%arrayidx245 = getelementptr inbounds float, ptr %b, i64 %92 - %93 = load float, ptr %arrayidx245, align 4 - %mul246 = fmul float %93, %alpha - %arrayidx249 = getelementptr inbounds float, ptr %a, i64 %92 - %94 = load float, ptr %arrayidx249, align 4 - %add250 = fadd float %94, %mul246 - store float %add250, ptr %arrayidx249, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32 - %cmp = icmp slt i64 %indvars.iv.next, 3200 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @goo32 - -; CHECK: for.body: -; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] -; CHECK: %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvar -; CHECK: %0 = load float, ptr %arrayidx, align 4 -; CHECK: %mul = fmul float %0, %alpha -; CHECK: %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvar -; CHECK: %1 = load float, ptr %arrayidx2, align 4 -; CHECK: %add = fadd float %1, %mul -; CHECK: store float %add, ptr %arrayidx2, align 4 -; CHECK: %indvar.next = add i64 %indvar, 1 -; CHECK: %exitcond = icmp eq i64 %indvar, 3199 -; CHECK: br i1 %exitcond, label %for.end, label %for.body -; CHECK: ret - -for.end: ; preds = %for.body - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/llvm/test/Transforms/LoopReroll/complex_reroll.ll b/llvm/test/Transforms/LoopReroll/complex_reroll.ll deleted file mode 100644 index 27139ee..0000000 --- a/llvm/test/Transforms/LoopReroll/complex_reroll.ll +++ /dev/null @@ -1,237 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -declare i32 @goo(i32, i32) - -@buf = external global ptr -@aaa = global [16 x i8] c"\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10", align 1 - -define i32 @test1(i32 %len) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SUM44_020:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr @aaa, i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SCEVGEP]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP1]] to i64 -; CHECK-NEXT: [[ADD]] = add i64 [[CONV]], [[SUM44_020]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 15 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] -; CHECK: while.end: -; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[CONV11:%.*]] = trunc i64 [[ADD9_LCSSA]] to i32 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @goo(i32 0, i32 [[CONV11]]) -; CHECK-NEXT: unreachable -; -entry: - br label %while.body - -while.body: - - %dec22 = phi i32 [ 4, %entry ], [ %dec, %while.body ] - %buf.021 = phi ptr [ @aaa, %entry ], [ %add.ptr, %while.body ] - %sum44.020 = phi i64 [ 0, %entry ], [ %add9, %while.body ] - %0 = load i8, ptr %buf.021, align 1 - %conv = zext i8 %0 to i64 - %add = add i64 %conv, %sum44.020 - %arrayidx1 = getelementptr inbounds i8, ptr %buf.021, i64 1 - %1 = load i8, ptr %arrayidx1, align 1 - %conv2 = zext i8 %1 to i64 - %add3 = add i64 %add, %conv2 - %arrayidx4 = getelementptr inbounds i8, ptr %buf.021, i64 2 - %2 = load i8, ptr %arrayidx4, align 1 - %conv5 = zext i8 %2 to i64 - %add6 = add i64 %add3, %conv5 - %arrayidx7 = getelementptr inbounds i8, ptr 
%buf.021, i64 3 - %3 = load i8, ptr %arrayidx7, align 1 - %conv8 = zext i8 %3 to i64 - %add9 = add i64 %add6, %conv8 - %add.ptr = getelementptr inbounds i8, ptr %buf.021, i64 4 - %dec = add nsw i32 %dec22, -1 - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body - %conv11 = trunc i64 %add9 to i32 - %call = tail call i32 @goo(i32 0, i32 %conv11) - unreachable -} - -define i32 @test2(i32 %N, ptr nocapture readonly %a, i32 %S) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_9:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP_9]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.lr.ph: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.for.cond.cleanup_crit_edge: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[S_ADDR_0_LCSSA:%.*]] = phi i32 [ [[ADD2_LCSSA]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]] ], [ [[S:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: ret i32 [[S_ADDR_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_LR_PH]] ] -; CHECK-NEXT: [[S_ADDR_011:%.*]] = phi i32 [ [[S]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[INDVAR]], 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_ADDR_011]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP4]], [[TMP3]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label [[FOR_BODY]] -; -entry: - %cmp.9 = icmp sgt i32 %N, 0 - br i1 %cmp.9, label %for.body.lr.ph, label %for.cond.cleanup - -for.body.lr.ph: - br label %for.body - -for.cond.for.cond.cleanup_crit_edge: - br label %for.cond.cleanup - -for.cond.cleanup: - %S.addr.0.lcssa = phi i32 [ %add2, %for.cond.for.cond.cleanup_crit_edge ], [ %S, %entry ] - ret i32 %S.addr.0.lcssa - -for.body: - - %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %add3, %for.body ] - %S.addr.011 = phi i32 [ %S, %for.body.lr.ph ], [ %add2, %for.body ] - %a.addr.010 = phi ptr [ %a, %for.body.lr.ph ], [ %incdec.ptr1, %for.body ] - %incdec.ptr = getelementptr inbounds i32, ptr %a.addr.010, i64 1 - %0 = load i32, ptr %a.addr.010, align 4 - %add = add nsw i32 %0, %S.addr.011 - %incdec.ptr1 = getelementptr inbounds i32, ptr %a.addr.010, i64 2 - %1 = load i32, ptr %incdec.ptr, align 4 - %add2 = add nsw i32 %add, %1 - %add3 = add nsw i32 %i.012, 2 - %cmp = icmp slt i32 %add3, %N - br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge -} - -define i32 @test3(ptr nocapture readonly %buf, i32 %len) #0 { -; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[LEN:%.*]], 1 -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], -2 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = shl 
nuw i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[S_012:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[INDVAR]], -4 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_012]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP4]], [[TMP3]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[S_0_LCSSA]] -; -entry: - %cmp10 = icmp sgt i32 %len, 1 - br i1 %cmp10, label %while.body.preheader, label %while.end - -while.body.preheader: ; preds = %entry - br label %while.body - -while.body: ; preds = %while.body.preheader, %while.body - - %i.013 = phi i32 [ %sub, %while.body ], [ %len, %while.body.preheader ] - %S.012 = phi i32 [ %add2, %while.body ], [ undef, %while.body.preheader ] - %buf.addr.011 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ] - %0 = load i32, ptr %buf.addr.011, align 4 - %add = add nsw i32 %0, %S.012 - %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.011, i64 -1 - %1 = load i32, ptr %arrayidx1, align 4 - %add2 = add nsw i32 %add, %1 - %add.ptr = getelementptr inbounds i32, ptr %buf.addr.011, i64 -2 - %sub = add nsw i32 %i.013, -2 - %cmp = icmp sgt i32 %sub, 1 - br i1 %cmp, label %while.body, label %while.end.loopexit - -while.end.loopexit: ; preds = %while.body - br label %while.end - -while.end: ; preds = %while.end.loopexit, %entry - %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2, %while.end.loopexit ] - ret i32 %S.0.lcssa -} - -define i32 @test4(i32 %len) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SUM44_020:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr @aaa, i64 [[INDVAR]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SCEVGEP]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP1]] to i64 -; CHECK-NEXT: [[ADD]] = add i64 [[CONV]], [[SUM44_020]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 23 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] -; CHECK: while.end: -; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[CONV11:%.*]] = trunc i64 [[ADD9_LCSSA]] to i32 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @goo(i32 0, i32 [[CONV11]]) -; CHECK-NEXT: unreachable -; -entry: - br label %while.body - -while.body: - %a = phi i32 [ 4, %entry ], [ %a.next, %while.body ] - %b = phi i32 [ 6, %entry ], [ %b.next, %while.body ] - 
%buf.021 = phi ptr [ @aaa, %entry ], [ %add.ptr, %while.body ] - %sum44.020 = phi i64 [ 0, %entry ], [ %add9, %while.body ] - %0 = load i8, ptr %buf.021, align 1 - %conv = zext i8 %0 to i64 - %add = add i64 %conv, %sum44.020 - %arrayidx1 = getelementptr inbounds i8, ptr %buf.021, i64 1 - %1 = load i8, ptr %arrayidx1, align 1 - %conv2 = zext i8 %1 to i64 - %add3 = add i64 %add, %conv2 - %arrayidx4 = getelementptr inbounds i8, ptr %buf.021, i64 2 - %2 = load i8, ptr %arrayidx4, align 1 - %conv5 = zext i8 %2 to i64 - %add6 = add i64 %add3, %conv5 - %arrayidx7 = getelementptr inbounds i8, ptr %buf.021, i64 3 - %3 = load i8, ptr %arrayidx7, align 1 - %conv8 = zext i8 %3 to i64 - %add9 = add i64 %add6, %conv8 - %add.ptr = getelementptr inbounds i8, ptr %buf.021, i64 4 - %a.next = add nsw i32 %a, -1 - %b.next = add nsw i32 %b, -1 - %cond = add nsw i32 %a, %b - %tobool = icmp eq i32 %cond, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body - %conv11 = trunc i64 %add9 to i32 - %call = tail call i32 @goo(i32 0, i32 %conv11) - unreachable -} - diff --git a/llvm/test/Transforms/LoopReroll/external_use.ll b/llvm/test/Transforms/LoopReroll/external_use.ll deleted file mode 100644 index 2124f3b..0000000 --- a/llvm/test/Transforms/LoopReroll/external_use.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s - -; Check whether rerolling is rejected if values of the base and root -; instruction are used outside the loop block. - -; Only the base/root instructions except a loop increment instruction -define void @test1() { -entry: - br label %loop1 - -loop1: -;CHECK-LABEL: loop1: -;CHECK-NEXT: %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop1 ] -;CHECK-NEXT: %indvar.1 = add nsw i64 %indvar, 1 - - %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop1 ] - %indvar.1 = add nsw i64 %indvar, 1 - %indvar.next = add nsw i64 %indvar, 2 - %cmp = icmp slt i64 %indvar.next, 200 - br i1 %cmp, label %loop1, label %exit - -exit: - %var1 = phi i64 [ %indvar.1, %loop1 ] - %var2 = phi i64 [ %indvar, %loop1 ] - ret void -} - -; Both the base/root instructions and reduction instructions -define void @test2() { -entry: - br label %loop2 - -loop2: -;CHECK-LABEL: loop2: -;CHECK-NEXT: %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %loop2 ] -;CHECK-NEXT: %redvar = phi i32 [ 0, %entry ], [ %add.2, %loop2 ] -;CHECK-NEXT: %indvar.1 = add nuw nsw i32 %indvar, 1 -;CHECK-NEXT: %indvar.2 = add nuw nsw i32 %indvar, 2 - - %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %loop2 ] - %redvar = phi i32 [ 0, %entry ], [ %add.2, %loop2 ] - %indvar.1 = add nuw nsw i32 %indvar, 1 - %indvar.2 = add nuw nsw i32 %indvar, 2 - %mul.0 = mul nsw i32 %indvar, %indvar - %mul.1 = mul nsw i32 %indvar.1, %indvar.1 - %mul.2 = mul nsw i32 %indvar.2, %indvar.2 - %add.0 = add nsw i32 %redvar, %mul.0 - %add.1 = add nsw i32 %add.0, %mul.1 - %add.2 = add nsw i32 %add.1, %mul.2 - %indvar.next = add nuw nsw i32 %indvar, 3 - %cmp = icmp slt i32 %indvar.next, 300 - br i1 %cmp, label %loop2, label %exit - -exit: - %a = phi i32 [ %indvar, %loop2 ] - %b = phi i32 [ %indvar.1, %loop2 ] - %c = phi i32 [ %indvar.2, %loop2 ] - %x = phi i32 [ %add.2, %loop2 ] - ret void -} diff --git a/llvm/test/Transforms/LoopReroll/extra_instr.ll b/llvm/test/Transforms/LoopReroll/extra_instr.ll deleted file mode 100644 index 3114463..0000000 --- a/llvm/test/Transforms/LoopReroll/extra_instr.ll +++ /dev/null @@ -1,361 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: 
--version 2 -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" - -define void @rerollable1(ptr nocapture %a) { -; CHECK-LABEL: define void @rerollable1 -; CHECK-SAME: (ptr nocapture [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[IV]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 160 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[IV]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 80 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[SCEVGEP1]], align 4 -; CHECK-NEXT: store i32 [[VALUE0]], ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND2:%.*]] = icmp eq i64 [[IV]], 9 -; CHECK-NEXT: br i1 [[EXITCOND2]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; base instruction - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - - ; NO unrerollable instructions - - ; extra simple arithmetic operations, used by root instructions - %plus20 = add nuw nsw i64 %iv, 20 - %plus10 = add nuw nsw i64 %iv, 10 - - ; root instruction 0 - %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0 - %value0 = load i32, ptr %ldptr0, align 4 - %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0 - store i32 %value0, ptr %stptr0, align 4 - - ; root instruction 1 - %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1 - %value1 = load i32, ptr %ldptr1, align 4 - %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1 - store i32 %value1, ptr %stptr1, align 4 - - ; loop-increment - %iv.next = add nuw nsw i64 %iv, 1 - - ; latch - %exitcond = icmp eq i64 %iv.next, 5 - br i1 %exitcond, label %exit, label %loop - -exit: - ret void -} - -define void @unrerollable1(ptr nocapture %a) { -; CHECK-LABEL: define void @unrerollable1 -; CHECK-SAME: (ptr nocapture [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[STPTRX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[IV]], i64 0 -; CHECK-NEXT: store i32 999, ptr [[STPTRX]], align 4 -; CHECK-NEXT: [[PLUS20:%.*]] = add nuw nsw i64 [[IV]], 20 -; CHECK-NEXT: [[PLUS10:%.*]] = add nuw nsw i64 [[IV]], 10 -; CHECK-NEXT: [[LDPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 0 -; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[LDPTR0]], align 4 -; CHECK-NEXT: [[STPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 0 -; CHECK-NEXT: store i32 [[VALUE0]], ptr [[STPTR0]], align 4 -; CHECK-NEXT: [[LDPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 1 -; CHECK-NEXT: [[VALUE1:%.*]] = load i32, ptr [[LDPTR1]], align 4 -; CHECK-NEXT: [[STPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 1 -; CHECK-NEXT: store i32 [[VALUE1]], ptr [[STPTR1]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 5 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label 
%loop - -loop: - - - ; base instruction - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - - ; unrerollable instructions using %iv - %stptrx = getelementptr inbounds [2 x i32], ptr %a, i64 %iv, i64 0 - store i32 999, ptr %stptrx, align 4 - - ; extra simple arithmetic operations, used by root instructions - %plus20 = add nuw nsw i64 %iv, 20 - %plus10 = add nuw nsw i64 %iv, 10 - - ; root instruction 0 - %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0 - %value0 = load i32, ptr %ldptr0, align 4 - %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0 - store i32 %value0, ptr %stptr0, align 4 - - ; root instruction 1 - %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1 - %value1 = load i32, ptr %ldptr1, align 4 - %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1 - store i32 %value1, ptr %stptr1, align 4 - - ; loop-increment - %iv.next = add nuw nsw i64 %iv, 1 - - ; latch - %exitcond = icmp eq i64 %iv.next, 5 - br i1 %exitcond, label %exit, label %loop - -exit: - ret void -} - -define void @unrerollable2(ptr nocapture %a) { -; CHECK-LABEL: define void @unrerollable2 -; CHECK-SAME: (ptr nocapture [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[STPTRX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[IV_NEXT]], i64 0 -; CHECK-NEXT: store i32 999, ptr [[STPTRX]], align 4 -; CHECK-NEXT: [[PLUS20:%.*]] = add nuw nsw i64 [[IV]], 20 -; CHECK-NEXT: [[PLUS10:%.*]] = add nuw nsw i64 [[IV]], 10 -; CHECK-NEXT: [[LDPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 0 -; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[LDPTR0]], align 4 -; CHECK-NEXT: [[STPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 0 -; CHECK-NEXT: store i32 [[VALUE0]], ptr [[STPTR0]], align 4 -; CHECK-NEXT: [[LDPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 1 -; CHECK-NEXT: [[VALUE1:%.*]] = load i32, ptr [[LDPTR1]], align 4 -; CHECK-NEXT: [[STPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 1 -; CHECK-NEXT: store i32 [[VALUE1]], ptr [[STPTR1]], align 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 5 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; base instruction - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - - ; loop-increment - %iv.next = add nuw nsw i64 %iv, 1 - - ; unrerollable instructions using %iv.next - %stptrx = getelementptr inbounds [2 x i32], ptr %a, i64 %iv.next, i64 0 - store i32 999, ptr %stptrx, align 4 - - ; extra simple arithmetic operations, used by root instructions - %plus20 = add nuw nsw i64 %iv, 20 - %plus10 = add nuw nsw i64 %iv, 10 - - ; root instruction 0 - %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0 - %value0 = load i32, ptr %ldptr0, align 4 - %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0 - store i32 %value0, ptr %stptr0, align 4 - - ; root instruction 1 - %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1 - %value1 = load i32, ptr %ldptr1, align 4 - %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1 - store i32 %value1, ptr %stptr1, align 4 - - ; latch - %exitcond = icmp eq i64 %iv.next, 5 - br i1 
%exitcond, label %exit, label %loop - -exit: - ret void -} - -define dso_local void @rerollable2() { -; CHECK-LABEL: define dso_local void @rerollable2() { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV]], 24 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV]], 20 -; CHECK-NEXT: [[IV_SCALED_DIV5:%.*]] = udiv i32 [[TMP1]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD4_DIV5:%.*]] = udiv i32 [[TMP0]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD4_DIV5]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; induction variable - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - - ; scale instruction - %iv.mul3 = mul nuw nsw i32 %iv, 3 - - ; extra simple arithmetic operations, used by root instructions - %iv.scaled = add nuw nsw i32 %iv.mul3, 20 - - ; NO unrerollable instructions - - ; root set 1 - - ; base instruction - %iv.scaled.div5 = udiv i32 %iv.scaled, 5 - tail call void @bar(i32 %iv.scaled.div5) - ; root instruction 0 - %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1 - %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5 - tail call void @bar(i32 %iv.scaled.add1.div5) - ; root instruction 2 - %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2 - %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5 - tail call void @bar(i32 %iv.scaled.add2.div5) - - ; root set 2 - - ; base instruction - %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4 - %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5 - tail call void @bar(i32 %iv.scaled.add4.div5) - ; root instruction 0 - %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5 - %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5 - tail call void @bar(i32 %iv.scaled.add5.div5) - ; root instruction 2 - %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6 - %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5 - tail call void @bar(i32 %iv.scaled.add6.div5) - - ; loop-increment - %iv.next = add nuw nsw i32 %iv, 1 - - ; latch - %cmp = icmp ult i32 %iv.next, 3 - br i1 %cmp, label %loop, label %exit - -exit: - ret void -} - -define dso_local void @unrerollable3() { -; CHECK-LABEL: define dso_local void @unrerollable3() { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_MUL3:%.*]] = mul nuw nsw i32 [[IV]], 3 -; CHECK-NEXT: [[IV_SCALED:%.*]] = add nuw nsw i32 [[IV_MUL3]], 20 -; CHECK-NEXT: [[IV_MUL7:%.*]] = mul nuw nsw i32 [[IV]], 7 -; CHECK-NEXT: tail call void @bar(i32 [[IV_MUL7]]) -; CHECK-NEXT: [[IV_SCALED_DIV5:%.*]] = udiv i32 [[IV_SCALED]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD1:%.*]] = add nuw nsw i32 [[IV_SCALED]], 1 -; CHECK-NEXT: [[IV_SCALED_ADD1_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD1]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD1_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD2:%.*]] = add nuw nsw i32 [[IV_SCALED]], 2 -; CHECK-NEXT: [[IV_SCALED_ADD2_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD2]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD2_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD4:%.*]] = add nuw nsw i32 [[IV_SCALED]], 4 -; CHECK-NEXT: 
[[IV_SCALED_ADD4_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD4]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD4_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD5:%.*]] = add nuw nsw i32 [[IV_SCALED]], 5 -; CHECK-NEXT: [[IV_SCALED_ADD5_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD5]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD5_DIV5]]) -; CHECK-NEXT: [[IV_SCALED_ADD6:%.*]] = add nuw nsw i32 [[IV_SCALED]], 6 -; CHECK-NEXT: [[IV_SCALED_ADD6_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD6]], 5 -; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD6_DIV5]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], 3 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop - -loop: - - - ; induction variable - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - - ; scale instruction - %iv.mul3 = mul nuw nsw i32 %iv, 3 - - ; extra simple arithmetic operations, used by root instructions - %iv.scaled = add nuw nsw i32 %iv.mul3, 20 - - ; unrerollable instructions using %iv - %iv.mul7 = mul nuw nsw i32 %iv, 7 - tail call void @bar(i32 %iv.mul7) - - ; root set 1 - - ; base instruction - %iv.scaled.div5 = udiv i32 %iv.scaled, 5 - tail call void @bar(i32 %iv.scaled.div5) - ; root instruction 0 - %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1 - %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5 - tail call void @bar(i32 %iv.scaled.add1.div5) - ; root instruction 2 - %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2 - %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5 - tail call void @bar(i32 %iv.scaled.add2.div5) - - ; root set 2 - - ; base instruction - %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4 - %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5 - tail call void @bar(i32 %iv.scaled.add4.div5) - ; root instruction 0 - %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5 - %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5 - tail call void @bar(i32 %iv.scaled.add5.div5) - ; root instruction 2 - %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6 - %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5 - tail call void @bar(i32 %iv.scaled.add6.div5) - - ; loop-increment - %iv.next = add nuw nsw i32 %iv, 1 - - ; latch - %cmp = icmp ult i32 %iv.next, 3 - br i1 %cmp, label %loop, label %exit - -exit: - ret void -} - -declare dso_local void @bar(i32) diff --git a/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll b/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll deleted file mode 100644 index 3fcd43f..0000000 --- a/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll +++ /dev/null @@ -1,184 +0,0 @@ -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" - -define void @test(i32 %n, ptr %arrayidx200, ptr %arrayidx164, ptr %arrayidx172) { -entry: - %rem.i = srem i32 %n, 4 - %t22 = load float, ptr %arrayidx172, align 4 - %cmp.9 = icmp eq i32 %n, 0 - %t7 = sext i32 %n to i64 - br i1 %cmp.9, label %while.end, label %while.body.preheader - -while.body.preheader: - br label %while.body - -while.body: -;CHECK-LABEL: while.body: -;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %while.body ], [ 0, %while.body.preheader ] -;CHECK-NEXT: %arrayidx62.i = getelementptr inbounds float, ptr %arrayidx200, i64 %indvar -;CHECK-NEXT: %t1 = load float, ptr %arrayidx62.i, align 4 -;CHECK-NEXT: %arrayidx64.i = getelementptr inbounds float, ptr %arrayidx164, i64 %indvar -;CHECK-NEXT: %t2 = load float, ptr %arrayidx64.i, align 4 -;CHECK-NEXT: %mul65.i = fmul fast float %t2, %t22 
-;CHECK-NEXT: %add66.i = fadd fast float %mul65.i, %t1 -;CHECK-NEXT: store float %add66.i, ptr %arrayidx62.i, align 4 -;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 -;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} -;CHECK-NEXT: br i1 %exitcond, label %while.end.loopexit, label %while.body - - %indvars.iv.i423 = phi i64 [ %indvars.iv.next.i424, %while.body ], [ 0, %while.body.preheader ] - %i.22.i = phi i32 [ %add103.i, %while.body ], [ %rem.i, %while.body.preheader ] - %arrayidx62.i = getelementptr inbounds float, ptr %arrayidx200, i64 %indvars.iv.i423 - %t1 = load float, ptr %arrayidx62.i, align 4 - %arrayidx64.i = getelementptr inbounds float, ptr %arrayidx164, i64 %indvars.iv.i423 - %t2 = load float, ptr %arrayidx64.i, align 4 - %mul65.i = fmul fast float %t2, %t22 - %add66.i = fadd fast float %mul65.i, %t1 - store float %add66.i, ptr %arrayidx62.i, align 4 - %t3 = add nsw i64 %indvars.iv.i423, 1 - %arrayidx71.i = getelementptr inbounds float, ptr %arrayidx200, i64 %t3 - %t4 = load float, ptr %arrayidx71.i, align 4 - %arrayidx74.i = getelementptr inbounds float, ptr %arrayidx164, i64 %t3 - %t5 = load float, ptr %arrayidx74.i, align 4 - %mul75.i = fmul fast float %t5, %t22 - %add76.i = fadd fast float %mul75.i, %t4 - store float %add76.i, ptr %arrayidx71.i, align 4 - %add103.i = add nsw i32 %i.22.i, 2 - %t6 = sext i32 %add103.i to i64 - %cmp58.i = icmp slt i64 %t6, %t7 - %indvars.iv.next.i424 = add i64 %indvars.iv.i423, 2 - br i1 %cmp58.i, label %while.body, label %while.end.loopexit - -while.end.loopexit: - br label %while.end - -while.end: - ret void -} - -; Function Attrs: noinline norecurse nounwind -define i32 @test2(i64 %n, ptr nocapture %x, ptr nocapture readonly %y) { -entry: - %cmp18 = icmp sgt i64 %n, 0 - br i1 %cmp18, label %for.body.preheader, label %for.end - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - -;CHECK-LABEL: for.body: -;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ] -;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvar -;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, ptr %arrayidx, align 4 -;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvar -;CHECK-NEXT: store i32 [[T1]], ptr %arrayidx3, align 4 -;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 -;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} -;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body - - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - store i32 %0, ptr %arrayidx3, align 4 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds i32, ptr %y, i64 %1 - %2 = load i32, ptr %arrayidx5, align 4 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %2, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, %n - br i1 %cmp, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret i32 0 -} - -; Function Attrs: noinline norecurse nounwind -define i32 @test3(i32 %n, ptr nocapture %x, ptr nocapture readonly %y) { -entry: - %cmp21 = icmp sgt i32 %n, 0 - br i1 %cmp21, label %for.body.preheader, label %for.end - -for.body.preheader: ; preds = 
%entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - -;CHECK-LABEL: for.body: -;CHECK: %add12 = add i8 %i.022, 2 -;CHECK-NEXT: %conv = sext i8 %add12 to i32 -;CHECK-NEXT: %cmp = icmp slt i32 %conv, %n -;CHECK-NEXT: br i1 %cmp, label %for.body, label %for.end.loopexit - - %conv23 = phi i32 [ %conv, %for.body ], [ 0, %for.body.preheader ] - %i.022 = phi i8 [ %add12, %for.body ], [ 0, %for.body.preheader ] - %idxprom = sext i8 %i.022 to i64 - %arrayidx = getelementptr inbounds i32, ptr %y, i64 %idxprom - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %idxprom - store i32 %0, ptr %arrayidx3, align 4 - %add = or disjoint i32 %conv23, 1 - %idxprom5 = sext i32 %add to i64 - %arrayidx6 = getelementptr inbounds i32, ptr %y, i64 %idxprom5 - %1 = load i32, ptr %arrayidx6, align 4 - %arrayidx10 = getelementptr inbounds i32, ptr %x, i64 %idxprom5 - store i32 %1, ptr %arrayidx10, align 4 - %add12 = add i8 %i.022, 2 - %conv = sext i8 %add12 to i32 - %cmp = icmp slt i32 %conv, %n - br i1 %cmp, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret i32 0 -} - -; Function Attrs: noinline norecurse nounwind -define i32 @test4(i64 %n, ptr nocapture %x, ptr nocapture readonly %y) { -entry: - %cmp18 = icmp eq i64 %n, 0 - br i1 %cmp18, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - -;CHECK-LABEL: for.body: -;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ] -;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvar -;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, ptr %arrayidx, align 4 -;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvar -;CHECK-NEXT: store i32 [[T1]], ptr %arrayidx3, align 4 -;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 -;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} -;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body - - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - store i32 %0, ptr %arrayidx3, align 4 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds i32, ptr %y, i64 %1 - %2 = load i32, ptr %arrayidx5, align 4 - %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %1 - store i32 %2, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp ult i64 %indvars.iv.next, %n - br i1 %cmp, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret i32 0 -} - diff --git a/llvm/test/Transforms/LoopReroll/negative.ll b/llvm/test/Transforms/LoopReroll/negative.ll deleted file mode 100644 index ef850c0..0000000 --- a/llvm/test/Transforms/LoopReroll/negative.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" -@buf = global [16 x i8] c"\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A", align 1 - -define i32 @test1(i32 %len, ptr nocapture readonly %buf) #0 { -entry: - %cmp.13 = icmp sgt i32 %len, 1 - br i1 %cmp.13, label %while.body.lr.ph, label %while.end - -while.body.lr.ph: ; preds = %entry - 
br label %while.body - -while.body: -;CHECK-LABEL: while.body: -;CHECK-NEXT: %indvar = phi i32 [ %indvar.next, %while.body ], [ 0, %while.body.lr.ph ] -;CHECK-NEXT: %sum4.015 = phi i64 [ 0, %while.body.lr.ph ], [ %add, %while.body ] -;CHECK-NOT: %sub5 = add nsw i32 %len.addr.014, -1 -;CHECK-NOT: %sub5 = add nsw i32 %len.addr.014, -2 -;CHECK: br i1 %exitcond, label %while.cond.while.end_crit_edge, label %while.body - - %sum4.015 = phi i64 [ 0, %while.body.lr.ph ], [ %add4, %while.body ] - %len.addr.014 = phi i32 [ %len, %while.body.lr.ph ], [ %sub5, %while.body ] - %idxprom = sext i32 %len.addr.014 to i64 - %arrayidx = getelementptr inbounds i8, ptr %buf, i64 %idxprom - %0 = load i8, ptr %arrayidx, align 1 - %conv = zext i8 %0 to i64 - %add = add i64 %conv, %sum4.015 - %sub = add nsw i32 %len.addr.014, -1 - %idxprom1 = sext i32 %sub to i64 - %arrayidx2 = getelementptr inbounds i8, ptr %buf, i64 %idxprom1 - %1 = load i8, ptr %arrayidx2, align 1 - %conv3 = zext i8 %1 to i64 - %add4 = add i64 %add, %conv3 - %sub5 = add nsw i32 %len.addr.014, -2 - %cmp = icmp sgt i32 %sub5, 1 - br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge - -while.cond.while.end_crit_edge: ; preds = %while.body - %add4.lcssa = phi i64 [ %add4, %while.body ] - %phitmp = trunc i64 %add4.lcssa to i32 - br label %while.end - -while.end: ; preds = %while.cond.while.end_crit_edge, %entry - %sum4.0.lcssa = phi i32 [ %phitmp, %while.cond.while.end_crit_edge ], [ 0, %entry ] - ret i32 %sum4.0.lcssa - unreachable -} - diff --git a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll b/llvm/test/Transforms/LoopReroll/nonconst_lb.ll deleted file mode 100644 index 80ea050..0000000 --- a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll +++ /dev/null @@ -1,168 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -target triple = "thumbv7-none-linux" - -;void foo(int *A, int *B, int m, int n) { -; for (int i = m; i < n; i+=4) { -; A[i+0] = B[i+0] * 4; -; A[i+1] = B[i+1] * 4; -; A[i+2] = B[i+2] * 4; -; A[i+3] = B[i+3] * 4; -; } -;} -define void @foo(ptr nocapture %A, ptr nocapture readonly %B, i32 %m, i32 %n) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP34:%.*]] = icmp slt i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP34]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], 4 -; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SMAX]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[M]] -; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[M]], [[INDVAR]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP7]], 2 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]] -; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4 -; 
CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP5]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %cmp34 = icmp slt i32 %m, %n - br i1 %cmp34, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %i.035 = phi i32 [ %add18, %for.body ], [ %m, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.035 - %0 = load i32, ptr %arrayidx, align 4 - %mul = shl nsw i32 %0, 2 - %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.035 - store i32 %mul, ptr %arrayidx2, align 4 - %add3 = add nsw i32 %i.035, 1 - %arrayidx4 = getelementptr inbounds i32, ptr %B, i32 %add3 - %1 = load i32, ptr %arrayidx4, align 4 - %mul5 = shl nsw i32 %1, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %A, i32 %add3 - store i32 %mul5, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %i.035, 2 - %arrayidx9 = getelementptr inbounds i32, ptr %B, i32 %add8 - %2 = load i32, ptr %arrayidx9, align 4 - %mul10 = shl nsw i32 %2, 2 - %arrayidx12 = getelementptr inbounds i32, ptr %A, i32 %add8 - store i32 %mul10, ptr %arrayidx12, align 4 - %add13 = add nsw i32 %i.035, 3 - %arrayidx14 = getelementptr inbounds i32, ptr %B, i32 %add13 - %3 = load i32, ptr %arrayidx14, align 4 - %mul15 = shl nsw i32 %3, 2 - %arrayidx17 = getelementptr inbounds i32, ptr %A, i32 %add13 - store i32 %mul15, ptr %arrayidx17, align 4 - %add18 = add nsw i32 %i.035, 4 - %cmp = icmp slt i32 %add18, %n - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body, %entry - ret void -} - -;void daxpy_ur(int n,float da,ptr dx,ptr dy) -; { -; int m = n % 4; -; for (int i = m; i < n; i = i + 4) -; { -; dy[i] = dy[i] + da*dx[i]; -; dy[i+1] = dy[i+1] + da*dx[i+1]; -; dy[i+2] = dy[i+2] + da*dx[i+2]; -; dy[i+3] = dy[i+3] + da*dx[i+3]; -; } -; } -define void @daxpy_ur(i32 %n, float %da, ptr nocapture readonly %dx, ptr nocapture %dy) { -; CHECK-LABEL: @daxpy_ur( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[N:%.*]], 4 -; CHECK-NEXT: [[CMP55:%.*]] = icmp slt i32 [[REM]], [[N]] -; CHECK-NEXT: br i1 [[CMP55]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[REM]] -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 3 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[REM]], [[INDVAR]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DY:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[DX:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP7]], [[DA:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP6]], [[MUL]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: 
for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -entry: - %rem = srem i32 %n, 4 - %cmp55 = icmp slt i32 %rem, %n - br i1 %cmp55, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %i.056 = phi i32 [ %add27, %for.body ], [ %rem, %entry ] - %arrayidx = getelementptr inbounds float, ptr %dy, i32 %i.056 - %0 = load float, ptr %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, ptr %dx, i32 %i.056 - %1 = load float, ptr %arrayidx1, align 4 - %mul = fmul float %1, %da - %add = fadd float %0, %mul - store float %add, ptr %arrayidx, align 4 - %add3 = add nsw i32 %i.056, 1 - %arrayidx4 = getelementptr inbounds float, ptr %dy, i32 %add3 - %2 = load float, ptr %arrayidx4, align 4 - %arrayidx6 = getelementptr inbounds float, ptr %dx, i32 %add3 - %3 = load float, ptr %arrayidx6, align 4 - %mul7 = fmul float %3, %da - %add8 = fadd float %2, %mul7 - store float %add8, ptr %arrayidx4, align 4 - %add11 = add nsw i32 %i.056, 2 - %arrayidx12 = getelementptr inbounds float, ptr %dy, i32 %add11 - %4 = load float, ptr %arrayidx12, align 4 - %arrayidx14 = getelementptr inbounds float, ptr %dx, i32 %add11 - %5 = load float, ptr %arrayidx14, align 4 - %mul15 = fmul float %5, %da - %add16 = fadd float %4, %mul15 - store float %add16, ptr %arrayidx12, align 4 - %add19 = add nsw i32 %i.056, 3 - %arrayidx20 = getelementptr inbounds float, ptr %dy, i32 %add19 - %6 = load float, ptr %arrayidx20, align 4 - %arrayidx22 = getelementptr inbounds float, ptr %dx, i32 %add19 - %7 = load float, ptr %arrayidx22, align 4 - %mul23 = fmul float %7, %da - %add24 = fadd float %6, %mul23 - store float %add24, ptr %arrayidx20, align 4 - %add27 = add nsw i32 %i.056, 4 - %cmp = icmp slt i32 %add27, %n - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body, %entry - ret void -} - diff --git a/llvm/test/Transforms/LoopReroll/ptrindvar.ll b/llvm/test/Transforms/LoopReroll/ptrindvar.ll deleted file mode 100644 index 90f6353..0000000 --- a/llvm/test/Transforms/LoopReroll/ptrindvar.ll +++ /dev/null @@ -1,125 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -passes=loop-reroll %s | FileCheck %s -target triple = "aarch64--linux-gnu" - -define i32 @test(ptr readonly %buf, ptr readnone %end) #0 { -; CHECK-LABEL: define i32 @test -; CHECK-SAME: (ptr readonly [[BUF:%.*]], ptr readnone [[END:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BUF2:%.*]] = ptrtoint ptr [[BUF]] to i64 -; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEXT: [[CMP_9:%.*]] = icmp eq ptr [[BUF]], [[END]] -; CHECK-NEXT: br i1 [[CMP_9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[BUF2]] -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[S_011:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[INDVAR]], 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; 
CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_011]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[S_0_LCSSA]] -; -entry: - %cmp.9 = icmp eq ptr %buf, %end - br i1 %cmp.9, label %while.end, label %while.body.preheader - -while.body.preheader: - br label %while.body - -while.body: - - %S.011 = phi i32 [ %add2, %while.body ], [ undef, %while.body.preheader ] - %buf.addr.010 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ] - %0 = load i32, ptr %buf.addr.010, align 4 - %add = add nsw i32 %0, %S.011 - %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.010, i64 1 - %1 = load i32, ptr %arrayidx1, align 4 - %add2 = add nsw i32 %add, %1 - %add.ptr = getelementptr inbounds i32, ptr %buf.addr.010, i64 2 - %cmp = icmp eq ptr %add.ptr, %end - br i1 %cmp, label %while.end.loopexit, label %while.body - -while.end.loopexit: - %add2.lcssa = phi i32 [ %add2, %while.body ] - br label %while.end - -while.end: - %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2.lcssa, %while.end.loopexit ] - ret i32 %S.0.lcssa -} - -define i32 @test2(ptr readonly %buf, ptr readnone %end) #0 { -; CHECK-LABEL: define i32 @test2 -; CHECK-SAME: (ptr readonly [[BUF:%.*]], ptr readnone [[END:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[END2:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEXT: [[BUF1:%.*]] = ptrtoint ptr [[BUF]] to i64 -; CHECK-NEXT: [[CMP_9:%.*]] = icmp eq ptr [[BUF]], [[END]] -; CHECK-NEXT: br i1 [[CMP_9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[BUF1]], -8 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[END2]] -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[S_011:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[INDVAR]], -4 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_011]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP4]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[S_0_LCSSA]] -; -entry: - %cmp.9 = icmp eq ptr %buf, %end - br i1 %cmp.9, label %while.end, label %while.body.preheader - -while.body.preheader: - br label %while.body - -while.body: - - %S.011 = phi i32 [ %add2, %while.body ], [ undef, 
%while.body.preheader ] - %buf.addr.010 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ] - %0 = load i32, ptr %buf.addr.010, align 4 - %add = add nsw i32 %0, %S.011 - %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.010, i64 -1 - %1 = load i32, ptr %arrayidx1, align 4 - %add2 = add nsw i32 %add, %1 - %add.ptr = getelementptr inbounds i32, ptr %buf.addr.010, i64 -2 - %cmp = icmp eq ptr %add.ptr, %end - br i1 %cmp, label %while.end.loopexit, label %while.body - -while.end.loopexit: - %add2.lcssa = phi i32 [ %add2, %while.body ] - br label %while.end - -while.end: - %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2.lcssa, %while.end.loopexit ] - ret i32 %S.0.lcssa -} diff --git a/llvm/test/Transforms/LoopReroll/reduction.ll b/llvm/test/Transforms/LoopReroll/reduction.ll deleted file mode 100644 index 94f4d53..0000000 --- a/llvm/test/Transforms/LoopReroll/reduction.ll +++ /dev/null @@ -1,132 +0,0 @@ -; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @foo(ptr nocapture readonly %x) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %add = add nsw i32 %0, %r.029 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %1 - %2 = load i32, ptr %arrayidx3, align 4 - %add4 = add nsw i32 %add, %2 - %3 = or disjoint i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3 - %4 = load i32, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %add4, %4 - %5 = or disjoint i64 %indvars.iv, 3 - %arrayidx11 = getelementptr inbounds i32, ptr %x, i64 %5 - %6 = load i32, ptr %arrayidx11, align 4 - %add12 = add nsw i32 %add8, %6 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 - %7 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %7, 400 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @foo - -; CHECK: for.body: -; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] -; CHECK: %r.029 = phi i32 [ 0, %entry ], [ %add, %for.body ] -; CHECK: %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvar -; CHECK: %1 = load i32, ptr %arrayidx, align 4 -; CHECK: %add = add nsw i32 %1, %r.029 -; CHECK: %indvar.next = add i64 %indvar, 1 -; CHECK: %exitcond = icmp eq i32 %0, 399 -; CHECK: br i1 %exitcond, label %for.end, label %for.body - -; CHECK: ret - -for.end: ; preds = %for.body - ret i32 %add12 -} - -define float @bar(ptr nocapture readonly %x) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %r.029 = phi float [ 0.0, %entry ], [ %add12, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %add = fadd float %0, %r.029 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds float, ptr %x, i64 %1 - %2 = load float, ptr %arrayidx3, align 4 - %add4 = fadd float %add, %2 - %3 = or disjoint i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds float, ptr %x, i64 %3 - %4 = load float, ptr %arrayidx7, align 4 - %add8 = fadd float %add4, %4 - 
%5 = or disjoint i64 %indvars.iv, 3 - %arrayidx11 = getelementptr inbounds float, ptr %x, i64 %5 - %6 = load float, ptr %arrayidx11, align 4 - %add12 = fadd float %add8, %6 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 - %7 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %7, 400 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @bar - -; CHECK: for.body: -; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] -; CHECK: %r.029 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] -; CHECK: %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvar -; CHECK: %1 = load float, ptr %arrayidx, align 4 -; CHECK: %add = fadd float %1, %r.029 -; CHECK: %indvar.next = add i64 %indvar, 1 -; CHECK: %exitcond = icmp eq i32 %0, 399 -; CHECK: br i1 %exitcond, label %for.end, label %for.body - -; CHECK: ret - -for.end: ; preds = %for.body - ret float %add12 -} - -define i32 @foo_unusedphi(ptr nocapture readonly %x) #0 { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %add = add nsw i32 %0, %0 - %1 = or disjoint i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %1 - %2 = load i32, ptr %arrayidx3, align 4 - %add4 = add nsw i32 %add, %2 - %3 = or disjoint i64 %indvars.iv, 2 - %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3 - %4 = load i32, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %add4, %4 - %5 = or disjoint i64 %indvars.iv, 3 - %arrayidx11 = getelementptr inbounds i32, ptr %x, i64 %5 - %6 = load i32, ptr %arrayidx11, align 4 - %add12 = add nsw i32 %add8, %6 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 - %7 = trunc i64 %indvars.iv.next to i32 - %cmp = icmp slt i32 %7, 400 - br i1 %cmp, label %for.body, label %for.end - -; CHECK-LABEL: @foo_unusedphi -; The above is just testing for a crash - no specific output expected. 
- -; CHECK: ret - -for.end: ; preds = %for.body - ret i32 %add12 -} - -attributes #0 = { nounwind readonly uwtable } - diff --git a/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll b/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll deleted file mode 100644 index e720e76..0000000 --- a/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll +++ /dev/null @@ -1,130 +0,0 @@ -;RUN: opt < %s -passes=loop-reroll -S | FileCheck %s -;void foo(ptr restrict a, ptr restrict b, int n) { -; for(int i = 0; i < n; i+=4) { -; a[i] = b[i]; -; a[i+1] = b[i+1]; -; a[i+2] = b[i+2]; -; a[i+3] = b[i+3]; -; } -;} -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv4t--linux-gnueabi" - -; Function Attrs: nounwind -define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %n) #0 !dbg !4 { -entry: -;CHECK-LABEL: @foo - - tail call void @llvm.dbg.value(metadata ptr %a, metadata !12, metadata !22), !dbg !23 - tail call void @llvm.dbg.value(metadata ptr %b, metadata !13, metadata !22), !dbg !24 - tail call void @llvm.dbg.value(metadata i32 %n, metadata !14, metadata !22), !dbg !25 - tail call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !22), !dbg !26 - %cmp.30 = icmp sgt i32 %n, 0, !dbg !27 - br i1 %cmp.30, label %for.body.preheader, label %for.cond.cleanup, !dbg !29 - -for.body.preheader: ; preds = %entry - br label %for.body, !dbg !30 - -for.cond.cleanup.loopexit: ; preds = %for.body - br label %for.cond.cleanup, !dbg !32 - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - ret void, !dbg !32 - -for.body: ; preds = %for.body.preheader, %for.body -;CHECK: for.body: -;CHECK: %indvar = phi i32 [ %indvar.next, %for.body ], [ 0, {{.*}} ] -;CHECK: load -;CHECK: store -;CHECK-NOT: load -;CHECK-NOT: store -;CHECK: call void @llvm.dbg.value -;CHECK: %indvar.next = add i32 %indvar, 1 -;CHECK: icmp eq i32 %indvar - %i.031 = phi i32 [ %add13, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds float, ptr %b, i32 %i.031, !dbg !30 - %0 = load i32, ptr %arrayidx, align 4, !dbg !30, !tbaa !33 - %arrayidx1 = getelementptr inbounds float, ptr %a, i32 %i.031, !dbg !37 - store i32 %0, ptr %arrayidx1, align 4, !dbg !38, !tbaa !33 - %add = or disjoint i32 %i.031, 1, !dbg !39 - %arrayidx2 = getelementptr inbounds float, ptr %b, i32 %add, !dbg !40 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !40, !tbaa !33 - %arrayidx4 = getelementptr inbounds float, ptr %a, i32 %add, !dbg !41 - store i32 %1, ptr %arrayidx4, align 4, !dbg !42, !tbaa !33 - %add5 = or disjoint i32 %i.031, 2, !dbg !43 - %arrayidx6 = getelementptr inbounds float, ptr %b, i32 %add5, !dbg !44 - %2 = load i32, ptr %arrayidx6, align 4, !dbg !44, !tbaa !33 - %arrayidx8 = getelementptr inbounds float, ptr %a, i32 %add5, !dbg !45 - store i32 %2, ptr %arrayidx8, align 4, !dbg !46, !tbaa !33 - %add9 = or disjoint i32 %i.031, 3, !dbg !47 - %arrayidx10 = getelementptr inbounds float, ptr %b, i32 %add9, !dbg !48 - %3 = load i32, ptr %arrayidx10, align 4, !dbg !48, !tbaa !33 - %arrayidx12 = getelementptr inbounds float, ptr %a, i32 %add9, !dbg !49 - store i32 %3, ptr %arrayidx12, align 4, !dbg !50, !tbaa !33 - %add13 = add nuw nsw i32 %i.031, 4, !dbg !51 - tail call void @llvm.dbg.value(metadata i32 %add13, metadata !15, metadata !22), !dbg !26 - %cmp = icmp slt i32 %add13, %n, !dbg !27 - br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !dbg !29 -} - -; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+strict-align" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!17, !18, !19, !20} -!llvm.ident = !{!21} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "test.c", directory: "/home/weimingz/llvm-build/release/community-tip") -!2 = !{} -!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11) -!5 = !DISubroutineType(types: !6) -!6 = !{null, !7, !7, !10} -!7 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !8) -!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 32, align: 32) -!9 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) -!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) -!11 = !{!12, !13, !14, !15} -!12 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 1, type: !7) -!13 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 1, type: !7) -!14 = !DILocalVariable(name: "n", arg: 3, scope: !4, file: !1, line: 1, type: !10) -!15 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 2, type: !10) -!16 = distinct !DILexicalBlock(scope: !4, file: !1, line: 2, column: 3) -!17 = !{i32 2, !"Dwarf Version", i32 4} -!18 = !{i32 2, !"Debug Info Version", i32 3} -!19 = !{i32 1, !"wchar_size", i32 4} -!20 = !{i32 1, !"min_enum_size", i32 4} -!21 = !{!"clang version 3.8.0"} -!22 = !DIExpression() -!23 = !DILocation(line: 1, column: 27, scope: !4) -!24 = !DILocation(line: 1, column: 47, scope: !4) -!25 = !DILocation(line: 1, column: 54, scope: !4) -!26 = !DILocation(line: 2, column: 11, scope: !16) -!27 = !DILocation(line: 2, column: 20, scope: !28) -!28 = distinct !DILexicalBlock(scope: !16, file: !1, line: 2, column: 3) -!29 = !DILocation(line: 2, column: 3, scope: !16) -!30 = !DILocation(line: 3, column: 12, scope: !31) -!31 = distinct !DILexicalBlock(scope: !28, file: !1, line: 2, column: 31) -!32 = !DILocation(line: 8, column: 1, scope: !4) -!33 = !{!34, !34, i64 0} -!34 = !{!"float", !35, i64 0} -!35 = !{!"omnipotent char", !36, i64 0} -!36 = !{!"Simple C/C++ TBAA"} -!37 = !DILocation(line: 3, column: 5, scope: !31) -!38 = !DILocation(line: 3, column: 10, scope: !31) -!39 = !DILocation(line: 4, column: 17, scope: !31) -!40 = !DILocation(line: 4, column: 14, scope: !31) -!41 = !DILocation(line: 4, column: 5, scope: !31) -!42 = !DILocation(line: 4, column: 12, scope: !31) -!43 = !DILocation(line: 5, column: 17, scope: !31) -!44 = !DILocation(line: 5, column: 14, scope: !31) -!45 = !DILocation(line: 5, column: 5, scope: !31) -!46 = !DILocation(line: 5, column: 12, scope: !31) -!47 = !DILocation(line: 6, column: 17, scope: !31) -!48 = !DILocation(line: 6, column: 14, scope: !31) -!49 = !DILocation(line: 6, column: 5, scope: !31) -!50 = !DILocation(line: 6, column: 12, scope: !31) -!51 = !DILocation(line: 2, column: 26, scope: !28) -- cgit v1.1 From 75798f21ca7626419ed341cf723ba51889e85500 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 9 Feb 2024 10:20:23 +0000 Subject: [gn 
build] Port ac3bd2bd5301 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn index bed26df..a1c0427 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -50,7 +50,6 @@ static_library("Scalar") { "LoopLoadElimination.cpp", "LoopPassManager.cpp", "LoopPredication.cpp", - "LoopRerollPass.cpp", "LoopRotation.cpp", "LoopSimplifyCFG.cpp", "LoopSink.cpp", -- cgit v1.1 From 1198c3aaff63d4ce63b760b4effc14babc0bdd8a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 18:51:48 +0000 Subject: [X86] PromoteMaskArithmetic - use ISD::isBitwiseLogicOp wrapper. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f310010..881524f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48012,8 +48012,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG, if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); - if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND && - N->getOpcode() != ISD::OR) + if (!ISD::isBitwiseLogicOp(N->getOpcode())) return SDValue(); SDValue N0 = N->getOperand(0); -- cgit v1.1 From 713fe6dfd4803fba41f2102479580bed058ca0b3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Feb 2024 19:00:49 +0000 Subject: [X86] PromoteMaskArithmetic - consistently use SDValue instead of underlying SDNode. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 881524f..7db1b8d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48006,24 +48006,24 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, // given x, y and z are of type \p VT. We can do so, if operands are either // truncates from VT types, the second operand is a vector of constants or can // be recursively promoted. -static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG, +static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, unsigned Depth) { // Limit recursion to avoid excessive compile times. if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); - if (!ISD::isBitwiseLogicOp(N->getOpcode())) + if (!ISD::isBitwiseLogicOp(N.getOpcode())) return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT)) + if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT)) return SDValue(); - if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1)) + if (SDValue NN0 = PromoteMaskArithmetic(N0, VT, DAG, Depth + 1)) N0 = NN0; else { // The Left side has to be a trunc. 
@@ -48037,7 +48037,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
     N0 = N0.getOperand(0);
   }
 
-  if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+  if (SDValue NN1 = PromoteMaskArithmetic(N1, VT, DAG, Depth + 1))
     N1 = NN1;
   else {
     // The right side has to be a 'trunc' or a constant vector.
@@ -48052,7 +48052,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
   }
 
-  return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+  return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
 }
 
 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -48061,24 +48061,24 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
 // some of the transition sequences.
 // Even with AVX-512 this is still useful for removing casts around logical
 // operations on vXi1 mask types.
-static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+static SDValue PromoteMaskArithmetic(SDValue N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
-  EVT VT = N->getValueType(0);
+  EVT VT = N.getValueType();
   assert(VT.isVector() && "Expected vector type");
 
   SDLoc DL(N);
-  assert((N->getOpcode() == ISD::ANY_EXTEND ||
-          N->getOpcode() == ISD::ZERO_EXTEND ||
-          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+  assert((N.getOpcode() == ISD::ANY_EXTEND ||
+          N.getOpcode() == ISD::ZERO_EXTEND ||
+          N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
 
-  SDValue Narrow = N->getOperand(0);
+  SDValue Narrow = N.getOperand(0);
   EVT NarrowVT = Narrow.getValueType();
 
   // Generate the wide operation.
-  SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+  SDValue Op = PromoteMaskArithmetic(Narrow, VT, DAG, 0);
   if (!Op)
     return SDValue();
 
-  switch (N->getOpcode()) {
+  switch (N.getOpcode()) {
   default: llvm_unreachable("Unexpected opcode");
   case ISD::ANY_EXTEND:
     return Op;
@@ -52549,7 +52549,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
 
   // Attempt to promote any comparison mask ops before moving the
   // SIGN_EXTEND_INREG in the way.
-  if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+  if (SDValue Promote = PromoteMaskArithmetic(N0, DAG, Subtarget))
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
 
   if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
@@ -52770,7 +52770,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
     return V;
 
   if (VT.isVector()) {
-    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget))
       return R;
 
     if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
@@ -52984,7 +52984,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
     return V;
 
   if (VT.isVector())
-    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget))
       return R;
 
   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
-- 
cgit v1.1

From 3902f9b6e2d925d50f9a4861d78e5aba07b6ef11 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 9 Feb 2024 10:23:49 +0000
Subject: [X86] PromoteMaskArithmetic - explicitly attempt to constant fold
 zext(c) instead of relying on getNode()

Don't rely on isBuildVectorOfConstantSDNodes/getNode to constant fold; this
could also help in cases where the constant is behind a bitcast.

Noticed while investigating #80668
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7db1b8d..5d8a3a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48026,7 +48026,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG,
   if (SDValue NN0 = PromoteMaskArithmetic(N0, VT, DAG, Depth + 1))
     N0 = NN0;
   else {
-    // The Left side has to be a trunc.
+    // The left side has to be a trunc.
     if (N0.getOpcode() != ISD::TRUNCATE)
       return SDValue();
 
@@ -48040,16 +48040,16 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG,
   if (SDValue NN1 = PromoteMaskArithmetic(N1, VT, DAG, Depth + 1))
     N1 = NN1;
   else {
-    // The right side has to be a 'trunc' or a constant vector.
+    // The right side has to be a 'trunc' or a (foldable) constant.
     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                     N1.getOperand(0).getValueType() == VT;
-    if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
-      return SDValue();
-
     if (RHSTrunc)
       N1 = N1.getOperand(0);
+    else if (SDValue Cst =
+                 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
+      N1 = Cst;
     else
-      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+      return SDValue();
   }
 
   return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
-- 
cgit v1.1

From 2cb61a1d117e2c20e3372bc23bf12b919feaaca2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 11:54:10 +0100
Subject: [clang][Interp] Fix initializing PredefinedExprs

---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 3 +--
 clang/test/Sema/ms_predefined_expr.cpp | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 2539e75..aaa8ac8 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -1750,8 +1750,7 @@ bool ByteCodeExprGen<Emitter>::VisitPredefinedExpr(const PredefinedExpr *E) {
   if (DiscardResult)
     return true;
 
-  assert(!Initializing);
-  return this->visit(E->getFunctionName());
+  return this->delegate(E->getFunctionName());
 }
 
 template <class Emitter>
diff --git a/clang/test/Sema/ms_predefined_expr.cpp b/clang/test/Sema/ms_predefined_expr.cpp
index 9f4eb27..b42a494 100644
--- a/clang/test/Sema/ms_predefined_expr.cpp
+++ b/clang/test/Sema/ms_predefined_expr.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions
+// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions -fexperimental-new-constant-interpreter
 
 using size_t = __SIZE_TYPE__;
-- 
cgit v1.1

From 316373abcc2abde414d4b9601f4752b6939a2133 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Fri, 9 Feb 2024 11:15:48 +0000
Subject: [llvm][AArch64] Refactor expansion of CALL_BTI and CALL_RVMARKER
 (#80419)

After a lot of churn in expandCALL_BTI, it ended up doing the exact same
thing that expandCALL_RVMARKER does.

This change factors out the common code to make that clear.
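
A minimal sketch of the factored-out shape, with plain structs standing in
for MachineInstr and MachineOperand (illustrative only; the real helper is
the createCall function in the diff below). In the diff, expandCALL_RVMARKER
passes RegMaskStartIdx = 2 because the RV target and call target precede the
regmask, while expandCALL_BTI passes 1.

    // Hypothetical stand-ins; only the operand-copying loop is the point.
    #include <cstddef>
    #include <vector>

    struct Operand {
      bool IsRegMask = false;  // true for the register-mask operand
      bool IsImplicit = false; // implicit-use flag on the final call
    };

    struct Instr {
      std::vector<Operand> Ops;
    };

    // Shared logic of both expanders: operands of the pseudo before the
    // regmask are register arguments and become implicit operands of the
    // real call; the regmask and everything after it are copied unchanged.
    // Assumes, as the real code does, that a regmask operand is present.
    void copyCallOperands(Instr &Call, const Instr &Pseudo,
                          std::size_t RegMaskStartIdx) {
      std::size_t I = RegMaskStartIdx;
      while (!Pseudo.Ops[I].IsRegMask) {
        Operand Arg = Pseudo.Ops[I++];
        Arg.IsImplicit = true; // register args are implicit uses of the call
        Call.Ops.push_back(Arg);
      }
      for (; I < Pseudo.Ops.size(); ++I) // regmask and trailing operands
        Call.Ops.push_back(Pseudo.Ops[I]);
    }
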
---
 .../Target/AArch64/AArch64ExpandPseudoInsts.cpp | 87 ++++++++++------------
 1 file changed, 40 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1af064b..b2c52b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -774,6 +774,39 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
   return true;
 }
 
+// Create a call to CallTarget, copying over all the operands from *MBBI,
+// starting at the regmask.
+static MachineInstr *createCall(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                const AArch64InstrInfo *TII,
+                                MachineOperand &CallTarget,
+                                unsigned RegMaskStartIdx) {
+  unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
+  MachineInstr *Call =
+      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opc)).getInstr();
+
+  assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
+         "invalid operand for regular call");
+  Call->addOperand(CallTarget);
+
+  // Register arguments are added during ISel, but cannot be added as explicit
+  // operands of the branch as it expects to be B <target> which is only one
+  // operand. Instead they are implicit operands used by the branch.
+  while (!MBBI->getOperand(RegMaskStartIdx).isRegMask()) {
+    auto MOP = MBBI->getOperand(RegMaskStartIdx);
+    assert(MOP.isReg() && "can only add register operands");
+    Call->addOperand(MachineOperand::CreateReg(
+        MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
+        /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
+    RegMaskStartIdx++;
+  }
+  for (const MachineOperand &MO :
+       llvm::drop_begin(MBBI->operands(), RegMaskStartIdx))
+    Call->addOperand(MO);
+
+  return Call;
+}
+
 bool AArch64ExpandPseudo::expandCALL_RVMARKER(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
   // Expand CALL_RVMARKER pseudo to:
@@ -782,31 +815,12 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER(
   // - another branch, to the runtime function
   // Mark the sequence as bundle, to avoid passes moving other code in between.
   MachineInstr &MI = *MBBI;
-
-  MachineInstr *OriginalCall;
   MachineOperand &RVTarget = MI.getOperand(0);
-  MachineOperand &CallTarget = MI.getOperand(1);
-  assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
-         "invalid operand for regular call");
   assert(RVTarget.isGlobal() && "invalid operand for attached call");
-  unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
-  OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
-  OriginalCall->addOperand(CallTarget);
-
-  unsigned RegMaskStartIdx = 2;
-  // Skip register arguments. Those are added during ISel, but are not
-  // needed for the concrete branch.
-  while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
-    auto MOP = MI.getOperand(RegMaskStartIdx);
-    assert(MOP.isReg() && "can only add register operands");
-    OriginalCall->addOperand(MachineOperand::CreateReg(
-        MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
-        /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
-    RegMaskStartIdx++;
-  }
-  for (const MachineOperand &MO :
-       llvm::drop_begin(MI.operands(), RegMaskStartIdx))
-    OriginalCall->addOperand(MO);
+  MachineInstr *OriginalCall =
+      createCall(MBB, MBBI, TII, MI.getOperand(1),
+                 // Regmask starts after the RV and call targets.
+                 /*RegMaskStartIdx=*/2);
 
   BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
       .addReg(AArch64::FP, RegState::Define)
@@ -834,31 +848,10 @@ bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB,
   // - a BTI instruction
   // Mark the sequence as a bundle, to avoid passes moving other code in
   // between.
   MachineInstr &MI = *MBBI;
-  MachineOperand &CallTarget = MI.getOperand(0);
-  assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
-         "invalid operand for regular call");
-  unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
-  MachineInstr *Call =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
-  Call->addOperand(CallTarget);
-
-  // 1 because we already added the branch target above.
-  unsigned RegMaskStartIdx = 1;
-  // The branch is BL <target>, so we cannot attach the arguments of the called
-  // function to it. Those must be added as implicitly used by the branch.
-  while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
-    auto MOP = MI.getOperand(RegMaskStartIdx);
-    assert(MOP.isReg() && "can only add register operands");
-    Call->addOperand(MachineOperand::CreateReg(
-        MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
-        /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
-    RegMaskStartIdx++;
-  }
-  for (const MachineOperand &MO :
-       llvm::drop_begin(MI.operands(), RegMaskStartIdx))
-    Call->addOperand(MO);
+  MachineInstr *Call = createCall(MBB, MBBI, TII, MI.getOperand(0),
+                                  // Regmask starts after the call target.
+                                  /*RegMaskStartIdx=*/1);
 
   Call->setCFIType(*MBB.getParent(), MI.getCFIType());
-- 
cgit v1.1

From b5a273a1cfe6f509f8d2541e04d9186438f33348 Mon Sep 17 00:00:00 2001
From: Stephen Tozer
Date: Fri, 9 Feb 2024 12:02:59 +0000
Subject: [Polly][DebugInfo] Use getStableDebugLoc to avoid
 intrinsic-dependent behaviour (#81246)

Polly currently uses `getDebugLoc` in a few places to produce diagnostic
output; this is correct when interacting with specific instructions, but may
be incorrect when dealing with instruction ranges if debug intrinsics are
included. As a general rule, the debug locations attached to debug intrinsics
may be misleading compared to the surrounding instructions, and are not
generally used for anything other than determining variable scope info; the
recommended approach is therefore to use `getStableDebugLoc` instead, which
skips over debug intrinsics.

This is necessary to fix test failures that occur when enabling
non-instruction debug info, which removes debug intrinsics from basic blocks
and thus alters the diagnostic output of Polly (despite causing no functional
change).
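
A minimal sketch of the distinction, with simplified stand-ins for LLVM's
Instruction and DebugLoc types (illustrative only; the real getStableDebugLoc
skips any debug intrinsics before reading the attached location). The test
updates below show the effect: the first reported source location moves, e.g.
from 1:12 to 2:8, once locations attached to debug intrinsics no longer
count.

    // Hypothetical stand-ins; the point is that the "stable" query returns
    // the same answer whether or not debug intrinsics are in the block.
    #include <optional>
    #include <vector>

    struct Inst {
      bool IsDebugIntrinsic = false; // stands in for llvm.dbg.* calls
      std::optional<unsigned> Line;  // stands in for an attached DebugLoc
    };

    // Old behaviour: first attached location, debug intrinsics included.
    std::optional<unsigned> firstLoc(const std::vector<Inst> &BB) {
      for (const Inst &I : BB)
        if (I.Line)
          return I.Line;
      return std::nullopt;
    }

    // New behaviour: skip debug intrinsics, as getStableDebugLoc does.
    std::optional<unsigned> firstStableLoc(const std::vector<Inst> &BB) {
      for (const Inst &I : BB) {
        if (I.IsDebugIntrinsic)
          continue;
        if (I.Line)
          return I.Line;
      }
      return std::nullopt;
    }
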
--- polly/lib/Analysis/ScopDetectionDiagnostic.cpp | 4 ++-- polly/lib/Support/ScopLocation.cpp | 2 +- polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll | 8 ++++---- polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp index 364e21a..30fbd17 100644 --- a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp +++ b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp @@ -122,7 +122,7 @@ void getDebugLocations(const BBPair &P, DebugLoc &Begin, DebugLoc &End) { continue; Todo.append(succ_begin(BB), succ_end(BB)); for (const Instruction &Inst : *BB) { - DebugLoc DL = Inst.getDebugLoc(); + DebugLoc DL = Inst.getStableDebugLoc(); if (!DL) continue; @@ -821,7 +821,7 @@ std::string ReportUnprofitable::getEndUserMessage() const { const DebugLoc &ReportUnprofitable::getDebugLoc() const { for (const BasicBlock *BB : R->blocks()) for (const Instruction &Inst : *BB) - if (const DebugLoc &DL = Inst.getDebugLoc()) + if (const DebugLoc &DL = Inst.getStableDebugLoc()) return DL; return R->getEntry()->getTerminator()->getDebugLoc(); diff --git a/polly/lib/Support/ScopLocation.cpp b/polly/lib/Support/ScopLocation.cpp index 01f3d68..9f9941d 100644 --- a/polly/lib/Support/ScopLocation.cpp +++ b/polly/lib/Support/ScopLocation.cpp @@ -25,7 +25,7 @@ void getDebugLocation(const Region *R, unsigned &LineBegin, unsigned &LineEnd, for (const BasicBlock *BB : R->blocks()) for (const Instruction &Inst : *BB) { - DebugLoc DL = Inst.getDebugLoc(); + DebugLoc DL = Inst.getStableDebugLoc(); if (!DL) continue; diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll index 6182371..35986b5 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll @@ -19,20 +19,20 @@ ; If we reject non-affine loops the non-affine loop bound will be reported: ; -; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop. +; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop. ; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: Failed to derive an affine function from the loop bounds. ; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here. ; If we allow non-affine loops the non-affine access will be reported: ; -; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop. +; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop. ; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: The array subscript of "A" is not affine ; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here. ; If we allow non-affine loops and non-affine accesses the region will be reported as not profitable: ; -; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop. -; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:1:12: No profitable polyhedral optimization found +; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop. 
+; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:2:8: No profitable polyhedral optimization found ; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll index 7661bd0..a0f2704 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll @@ -2,7 +2,7 @@ ; ; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c ; -; CHECK: remark: uuencode.c:75:18: The following errors keep this region from being a Scop. +; CHECK: remark: uuencode.c:76:13: The following errors keep this region from being a Scop. ; CHECK: remark: uuencode.c:83:3: Loop cannot be handled because it has multiple exits. ; CHECK: remark: uuencode.c:95:21: Invalid Scop candidate ends here. -- cgit v1.1 From 614fab49b0b47c6463fb4d9d788790345bfdb6ce Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 9 Feb 2024 11:16:34 +0000 Subject: [X86] PromoteMaskArithmetic - share the same SDLoc argument instead of recreating it over and over again. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5d8a3a9..0c2d5f8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48006,8 +48006,8 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, // given x, y and z are of type \p VT. We can do so, if operands are either // truncates from VT types, the second operand is a vector of constants or can // be recursively promoted. -static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, - unsigned Depth) { +static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, + SelectionDAG &DAG, unsigned Depth) { // Limit recursion to avoid excessive compile times. if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); @@ -48017,13 +48017,12 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); - SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT)) return SDValue(); - if (SDValue NN0 = PromoteMaskArithmetic(N0, VT, DAG, Depth + 1)) + if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1)) N0 = NN0; else { // The left side has to be a trunc. @@ -48037,7 +48036,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, N0 = N0.getOperand(0); } - if (SDValue NN1 = PromoteMaskArithmetic(N1, VT, DAG, Depth + 1)) + if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1)) N1 = NN1; else { // The right side has to be a 'trunc' or a (foldable) constant. @@ -48061,12 +48060,11 @@ static SDValue PromoteMaskArithmetic(SDValue N, EVT VT, SelectionDAG &DAG, // some of the transition sequences. // Even with AVX-512 this is still useful for removing casts around logical // operations on vXi1 mask types. 
-static SDValue PromoteMaskArithmetic(SDValue N, SelectionDAG &DAG, +static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N.getValueType(); assert(VT.isVector() && "Expected vector type"); - - SDLoc DL(N); assert((N.getOpcode() == ISD::ANY_EXTEND || N.getOpcode() == ISD::ZERO_EXTEND || N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); @@ -48075,7 +48073,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, SelectionDAG &DAG, EVT NarrowVT = Narrow.getValueType(); // Generate the wide operation. - SDValue Op = PromoteMaskArithmetic(Narrow, VT, DAG, 0); + SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0); if (!Op) return SDValue(); switch (N.getOpcode()) { @@ -52549,7 +52547,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, // Attempt to promote any comparison mask ops before moving the // SIGN_EXTEND_INREG in the way. - if (SDValue Promote = PromoteMaskArithmetic(N0, DAG, Subtarget)) + if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { @@ -52770,7 +52768,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return V; if (VT.isVector()) { - if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget)) + if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget)) return R; if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) @@ -52984,7 +52982,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, return V; if (VT.isVector()) - if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DAG, Subtarget)) + if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) -- cgit v1.1 From 047f8321f14a53caad7b564f7f654a470fdca8a9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 9 Feb 2024 11:32:14 +0000 Subject: [X86] ctpop-mask.ll - add 32-bit with SSE2 test coverage 32-bit targets will try to use SSE2 <2 x i64> CTPOP expansion for i64 CTPOP --- llvm/test/CodeGen/X86/ctpop-mask.ll | 306 +++++++++++++++++++++++++++--------- 1 file changed, 229 insertions(+), 77 deletions(-) diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index e0a96a9..6d4fa4a4 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X86-POPCOUNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X64-POPCOUNT -; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT +; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-NO-SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X64-NO-POPCOUNT declare i8 @llvm.ctpop.i8(i8) nounwind readnone @@ -28,17 +29,42 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_mask2: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl 
$3, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_mask2: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: andl $3, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_mask2: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: andl $3, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask2: ; X64-NO-POPCOUNT: # %bb.0: @@ -192,17 +218,42 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_mask4: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl $15, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_mask4: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: andl $15, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_mask4: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: andl $15, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 
+; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask4: ; X64-NO-POPCOUNT: # %bb.0: @@ -274,17 +325,42 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_mask5: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl $31, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_mask5: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: andl $31, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_mask5: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: andl $31, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask5: ; X64-NO-POPCOUNT: # %bb.0: @@ -395,18 +471,43 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: shrl $9, %eax -; X86-NO-POPCOUNT-NEXT: andl $51, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_shifted_mask6: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: shrl $9, %eax +; X86-NO-SSE2-NEXT: andl $51, %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 
0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_shifted_mask6: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $26112, %eax # imm = 0x6600 +; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: ; X64-NO-POPCOUNT: # %bb.0: @@ -559,16 +660,41 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_shifted_mask8: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-SSE2-NEXT: shrl $3, %eax +; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-SSE2-NEXT: shrl $28, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_shifted_mask8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: shll $8, %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: ; X64-NO-POPCOUNT: # %bb.0: @@ -657,27 +783,53 @@ define i64 @ctpop_shifted_mask16(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntq %rax, %rax ; 
X64-POPCOUNT-NEXT: retq ; -; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: -; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-POPCOUNT-NEXT: movl %ecx, %eax -; X86-NO-POPCOUNT-NEXT: andl $524280, %eax # imm = 0x7FFF8 -; X86-NO-POPCOUNT-NEXT: shrl %ecx -; X86-NO-POPCOUNT-NEXT: andl $87380, %ecx # imm = 0x15554 -; X86-NO-POPCOUNT-NEXT: subl %ecx, %eax -; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx -; X86-NO-POPCOUNT-NEXT: andl $858993456, %ecx # imm = 0x33333330 -; X86-NO-POPCOUNT-NEXT: shrl $2, %eax -; X86-NO-POPCOUNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NO-POPCOUNT-NEXT: addl %ecx, %eax -; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx -; X86-NO-POPCOUNT-NEXT: shrl $4, %ecx -; X86-NO-POPCOUNT-NEXT: addl %eax, %ecx -; X86-NO-POPCOUNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X86-NO-POPCOUNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X86-NO-POPCOUNT-NEXT: shrl $24, %eax -; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx -; X86-NO-POPCOUNT-NEXT: retl +; X86-NO-SSE2-LABEL: ctpop_shifted_mask16: +; X86-NO-SSE2: # %bb.0: +; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SSE2-NEXT: movl %ecx, %eax +; X86-NO-SSE2-NEXT: andl $524280, %eax # imm = 0x7FFF8 +; X86-NO-SSE2-NEXT: shrl %ecx +; X86-NO-SSE2-NEXT: andl $87380, %ecx # imm = 0x15554 +; X86-NO-SSE2-NEXT: subl %ecx, %eax +; X86-NO-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SSE2-NEXT: andl $858993456, %ecx # imm = 0x33333330 +; X86-NO-SSE2-NEXT: shrl $2, %eax +; X86-NO-SSE2-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NO-SSE2-NEXT: addl %ecx, %eax +; X86-NO-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SSE2-NEXT: shrl $4, %ecx +; X86-NO-SSE2-NEXT: addl %eax, %ecx +; X86-NO-SSE2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NO-SSE2-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NO-SSE2-NEXT: shrl $24, %eax +; X86-NO-SSE2-NEXT: xorl %edx, %edx +; X86-NO-SSE2-NEXT: retl +; +; X86-SSE2-LABEL: ctpop_shifted_mask16: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $524280, %eax # imm = 0x7FFF8 +; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: ; X64-NO-POPCOUNT: # %bb.0: -- cgit v1.1 From 9ba265636f3310b5b5b39767715e1843a06ea603 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 9 Feb 2024 11:51:40 +0000 Subject: [X86] ReplaceNodeResults - shrink i64 CTPOP to (shifted) CTPOP i32 if 32 or less active bits to avoid SSE2 codegen 32-bit targets perform i64 CTPOP as a v2i64 CTPOP - if we can perform this as a i32 CTPOP by shifting the source bits, then do so to avoid the gpr<->xmm This also triggers on non-SSE2 capable targets, as can be seen with the minor codegen diffs in ctpop_shifted_mask16 --- 
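As a note on the premise of the transform, it can be sketched in plain C++. This is an illustrative model only, not the DAG legalizer code itself; popcnt32 and CtpopShrunk are hypothetical names, and the stated precondition mirrors the KnownBits test in the patch (countMinLeadingZeros + countMinTrailingZeros >= 32):

  #include <cstdint>

  // Kernighan loop standing in for a 32-bit population count.
  static unsigned popcnt32(uint32_t V) {
    unsigned N = 0;
    while (V) {
      V &= V - 1; // clears the lowest set bit
      ++N;
    }
    return N;
  }

  uint64_t CtpopShrunk(uint64_t X, unsigned TZ) {
    // Precondition: leading zeros + trailing zeros of X >= 32, with TZ the
    // known trailing zeros. Every set bit of X >> TZ then lands in bits
    // 0..31, so the truncation below drops only zero bits.
    return popcnt32(static_cast<uint32_t>(X >> TZ));
  }

Under that precondition the i64 count equals the i32 count of the shifted-down window, which is why the patch can emit SRL + TRUNCATE + i32 CTPOP + ZERO_EXTEND without changing the result.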
llvm/lib/Target/X86/X86ISelLowering.cpp | 14 ++ llvm/test/CodeGen/X86/ctpop-mask.ll | 308 ++++++++------------------------ 2 files changed, 93 insertions(+), 229 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0c2d5f8..18f9871 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32109,6 +32109,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + // If we have at most 32 active bits, then perform as i32 CTPOP. + // TODO: Perform this in generic legalizer? + KnownBits Known = DAG.computeKnownBits(N->getOperand(0)); + unsigned LZ = Known.countMinLeadingZeros(); + unsigned TZ = Known.countMinTrailingZeros(); + if ((LZ + TZ) >= 32) { + SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0), + DAG.getShiftAmountConstant(TZ, MVT::i64, dl)); + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op); + Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op); + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op); + Results.push_back(Op); + return; + } // Use a v2i64 if possible. bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index 6d4fa4a4..97c634a 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X86-POPCOUNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X64-POPCOUNT -; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-NO-SSE2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT,X86-SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X64-NO-POPCOUNT declare i8 @llvm.ctpop.i8(i8) nounwind readnone @@ -29,42 +29,17 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_mask2: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: andl $3, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_mask2: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $3, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; 
X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_mask2: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: andl $3, %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask2: ; X64-NO-POPCOUNT: # %bb.0: @@ -218,42 +193,17 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_mask4: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: andl $15, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_mask4: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $15, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_mask4: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: andl $15, %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask4: ; X64-NO-POPCOUNT: # %bb.0: @@ -325,42 +275,17 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_mask5: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: andl $31, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 -; 
X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_mask5: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $31, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_mask5: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: andl $31, %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask5: ; X64-NO-POPCOUNT: # %bb.0: @@ -471,43 +396,18 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_shifted_mask6: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: shrl $9, %eax -; X86-NO-SSE2-NEXT: andl $51, %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_shifted_mask6: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $26112, %eax # imm = 0x6600 -; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: shrl $9, %eax +; X86-NO-POPCOUNT-NEXT: andl $51, %eax +; 
X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: ; X64-NO-POPCOUNT: # %bb.0: @@ -660,41 +560,16 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_shifted_mask8: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NO-SSE2-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-SSE2-NEXT: shrl $3, %eax -; X86-NO-SSE2-NEXT: andl $286331153, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-SSE2-NEXT: shrl $28, %eax -; X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_shifted_mask8: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: shll $8, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: ; X64-NO-POPCOUNT: # %bb.0: @@ -783,53 +658,28 @@ define i64 @ctpop_shifted_mask16(i64 %x) nounwind readnone { ; X64-POPCOUNT-NEXT: popcntq %rax, %rax ; X64-POPCOUNT-NEXT: retq ; -; X86-NO-SSE2-LABEL: ctpop_shifted_mask16: -; X86-NO-SSE2: # %bb.0: -; X86-NO-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-SSE2-NEXT: movl %ecx, %eax -; X86-NO-SSE2-NEXT: andl $524280, %eax # imm = 0x7FFF8 -; X86-NO-SSE2-NEXT: shrl %ecx -; X86-NO-SSE2-NEXT: andl $87380, %ecx # imm = 0x15554 -; X86-NO-SSE2-NEXT: subl %ecx, %eax -; X86-NO-SSE2-NEXT: movl %eax, %ecx -; X86-NO-SSE2-NEXT: andl $858993456, %ecx # imm = 0x33333330 -; X86-NO-SSE2-NEXT: shrl $2, %eax -; X86-NO-SSE2-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NO-SSE2-NEXT: addl %ecx, %eax -; X86-NO-SSE2-NEXT: movl %eax, %ecx -; X86-NO-SSE2-NEXT: shrl $4, %ecx -; X86-NO-SSE2-NEXT: addl %eax, %ecx -; X86-NO-SSE2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X86-NO-SSE2-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X86-NO-SSE2-NEXT: shrl $24, %eax -; 
X86-NO-SSE2-NEXT: xorl %edx, %edx -; X86-NO-SSE2-NEXT: retl -; -; X86-SSE2-LABEL: ctpop_shifted_mask16: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl $524280, %eax # imm = 0x7FFF8 -; X86-SSE2-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: retl +; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: +; X86-NO-POPCOUNT: # %bb.0: +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-POPCOUNT-NEXT: movl %ecx, %eax +; X86-NO-POPCOUNT-NEXT: andl $524280, %eax # imm = 0x7FFF8 +; X86-NO-POPCOUNT-NEXT: shrl $4, %ecx +; X86-NO-POPCOUNT-NEXT: andl $21845, %ecx # imm = 0x5555 +; X86-NO-POPCOUNT-NEXT: shrl $3, %eax +; X86-NO-POPCOUNT-NEXT: subl %ecx, %eax +; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NO-POPCOUNT-NEXT: shrl $2, %eax +; X86-NO-POPCOUNT-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NO-POPCOUNT-NEXT: addl %ecx, %eax +; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: shrl $4, %ecx +; X86-NO-POPCOUNT-NEXT: addl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NO-POPCOUNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NO-POPCOUNT-NEXT: shrl $24, %eax +; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx +; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask16: ; X64-NO-POPCOUNT: # %bb.0: -- cgit v1.1 From eb9cd800b3c8c787f75c00e7d9de3ae6a2e5f876 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 9 Feb 2024 20:28:34 +0800 Subject: [Clang][TableGen] Add Features to TargetBuiltin RISCV target will use this parameter, so we need a way to specify it. 
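With the field in place, a target builtin can carry its required feature string directly in TableGen. A minimal hypothetical sketch (the def name and feature string here are illustrative; the real RISC-V definitions arrive in the follow-up patch below):

  include "clang/Basic/BuiltinsBase.td"

  def example_builtin : TargetBuiltin {
    let Spellings = ["__builtin_riscv_example"];
    let Prototype = "unsigned int(unsigned int)";
    // Emitted as the FEATURE string of the corresponding TARGET_BUILTIN entry.
    let Features = "zbb";
  }
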
Reviewers: AaronBallman, philnik777 Reviewed By: AaronBallman Pull Request: https://github.com/llvm/llvm-project/pull/80279 --- clang/include/clang/Basic/BuiltinsBase.td | 4 +++- clang/utils/TableGen/ClangBuiltinsEmitter.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td index b65b41b..bfccff5 100644 --- a/clang/include/clang/Basic/BuiltinsBase.td +++ b/clang/include/clang/Basic/BuiltinsBase.td @@ -87,7 +87,9 @@ class CustomEntry { } class AtomicBuiltin : Builtin; -class TargetBuiltin : Builtin; +class TargetBuiltin : Builtin { + string Features = ""; +} class LibBuiltin : Builtin { string Header = header; diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp index dc10fa1..48f55b8 100644 --- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp +++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp @@ -219,7 +219,7 @@ void EmitBuiltinDef(llvm::raw_ostream &OS, StringRef Substitution, break; } case BuiltinType::TargetBuiltin: - OS << ", \"\""; + OS << ", \"" << Builtin->getValueAsString("Features") << "\""; break; case BuiltinType::AtomicBuiltin: case BuiltinType::Builtin: -- cgit v1.1 From a8d4a024e6bea3ae71d6187f0c040b2b25e4bf69 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 9 Feb 2024 20:27:17 +0800 Subject: [Clang][RISCV] Refactor builtins to TableGen This mechanism is introduced by #68324. This refactor makes the prototype and attributes clear. Reviewers: asb, kito-cheng, philnik777, topperc, preames Reviewed By: topperc Pull Request: https://github.com/llvm/llvm-project/pull/80280 --- clang/include/clang/Basic/BuiltinsRISCV.def | 93 ----------------- clang/include/clang/Basic/BuiltinsRISCV.td | 148 ++++++++++++++++++++++++++++ clang/include/clang/Basic/CMakeLists.txt | 4 + clang/include/clang/Basic/TargetBuiltins.h | 2 +- clang/include/module.modulemap | 1 - clang/lib/Basic/Targets/RISCV.cpp | 2 +- 6 files changed, 154 insertions(+), 96 deletions(-) delete mode 100644 clang/include/clang/Basic/BuiltinsRISCV.def create mode 100644 clang/include/clang/Basic/BuiltinsRISCV.td diff --git a/clang/include/clang/Basic/BuiltinsRISCV.def b/clang/include/clang/Basic/BuiltinsRISCV.def deleted file mode 100644 index 1528b18..0000000 --- a/clang/include/clang/Basic/BuiltinsRISCV.def +++ /dev/null @@ -1,93 +0,0 @@ -//==- BuiltinsRISCV.def - RISC-V Builtin function database -------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the RISC-V-specific builtin function database. Users of -// this file must define the BUILTIN macro to make use of this information. 
-// -//===----------------------------------------------------------------------===// - -#if defined(BUILTIN) && !defined(TARGET_BUILTIN) -# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) -#endif - -// Zbb extension -TARGET_BUILTIN(__builtin_riscv_orc_b_32, "UiUi", "nc", "zbb") -TARGET_BUILTIN(__builtin_riscv_orc_b_64, "UWiUWi", "nc", "zbb,64bit") -TARGET_BUILTIN(__builtin_riscv_clz_32, "UiUi", "nc", "zbb|xtheadbb") -TARGET_BUILTIN(__builtin_riscv_clz_64, "UiUWi", "nc", "zbb|xtheadbb,64bit") -TARGET_BUILTIN(__builtin_riscv_ctz_32, "UiUi", "nc", "zbb") -TARGET_BUILTIN(__builtin_riscv_ctz_64, "UiUWi", "nc", "zbb,64bit") - -// Zbc or Zbkc extension -TARGET_BUILTIN(__builtin_riscv_clmul_32, "UiUiUi", "nc", "zbc|zbkc") -TARGET_BUILTIN(__builtin_riscv_clmul_64, "UWiUWiUWi", "nc", "zbc|zbkc,64bit") -TARGET_BUILTIN(__builtin_riscv_clmulh_32, "UiUiUi", "nc", "zbc|zbkc,32bit") -TARGET_BUILTIN(__builtin_riscv_clmulh_64, "UWiUWiUWi", "nc", "zbc|zbkc,64bit") -TARGET_BUILTIN(__builtin_riscv_clmulr_32, "UiUiUi", "nc", "zbc,32bit") -TARGET_BUILTIN(__builtin_riscv_clmulr_64, "UWiUWiUWi", "nc", "zbc,64bit") - -// Zbkx -TARGET_BUILTIN(__builtin_riscv_xperm4_32, "UiUiUi", "nc", "zbkx,32bit") -TARGET_BUILTIN(__builtin_riscv_xperm4_64, "UWiUWiUWi", "nc", "zbkx,64bit") -TARGET_BUILTIN(__builtin_riscv_xperm8_32, "UiUiUi", "nc", "zbkx,32bit") -TARGET_BUILTIN(__builtin_riscv_xperm8_64, "UWiUWiUWi", "nc", "zbkx,64bit") - -// Zbkb extension -TARGET_BUILTIN(__builtin_riscv_brev8_32, "UiUi", "nc", "zbkb") -TARGET_BUILTIN(__builtin_riscv_brev8_64, "UWiUWi", "nc", "zbkb,64bit") -TARGET_BUILTIN(__builtin_riscv_zip_32, "UiUi", "nc", "zbkb,32bit") -TARGET_BUILTIN(__builtin_riscv_unzip_32, "UiUi", "nc", "zbkb,32bit") - -// Zknd extension -TARGET_BUILTIN(__builtin_riscv_aes32dsi, "UiUiUiIUi", "nc", "zknd,32bit") -TARGET_BUILTIN(__builtin_riscv_aes32dsmi, "UiUiUiIUi", "nc", "zknd,32bit") -TARGET_BUILTIN(__builtin_riscv_aes64ds, "UWiUWiUWi", "nc", "zknd,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64dsm, "UWiUWiUWi", "nc", "zknd,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64im, "UWiUWi", "nc", "zknd,64bit") - -// Zknd & Zkne -TARGET_BUILTIN(__builtin_riscv_aes64ks1i, "UWiUWiIUi", "nc", "zknd|zkne,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64ks2, "UWiUWiUWi", "nc", "zknd|zkne,64bit") - -// Zkne extension -TARGET_BUILTIN(__builtin_riscv_aes32esi, "UiUiUiIUi", "nc", "zkne,32bit") -TARGET_BUILTIN(__builtin_riscv_aes32esmi, "UiUiUiIUi", "nc", "zkne,32bit") -TARGET_BUILTIN(__builtin_riscv_aes64es, "UWiUWiUWi", "nc", "zkne,64bit") -TARGET_BUILTIN(__builtin_riscv_aes64esm, "UWiUWiUWi", "nc", "zkne,64bit") - -// Zknh extension -TARGET_BUILTIN(__builtin_riscv_sha256sig0, "UiUi", "nc", "zknh") -TARGET_BUILTIN(__builtin_riscv_sha256sig1, "UiUi", "nc", "zknh") -TARGET_BUILTIN(__builtin_riscv_sha256sum0, "UiUi", "nc", "zknh") -TARGET_BUILTIN(__builtin_riscv_sha256sum1, "UiUi", "nc", "zknh") - -TARGET_BUILTIN(__builtin_riscv_sha512sig0h, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig0l, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig1h, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig1l, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sum0r, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sum1r, "UiUiUi", "nc", "zknh,32bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig0, "UWiUWi", "nc", "zknh,64bit") -TARGET_BUILTIN(__builtin_riscv_sha512sig1, "UWiUWi", "nc", "zknh,64bit") 
-TARGET_BUILTIN(__builtin_riscv_sha512sum0, "UWiUWi", "nc", "zknh,64bit") -TARGET_BUILTIN(__builtin_riscv_sha512sum1, "UWiUWi", "nc", "zknh,64bit") - -// Zksed extension -TARGET_BUILTIN(__builtin_riscv_sm4ed, "UiUiUiIUi", "nc", "zksed") -TARGET_BUILTIN(__builtin_riscv_sm4ks, "UiUiUiIUi", "nc", "zksed") - -// Zksh extension -TARGET_BUILTIN(__builtin_riscv_sm3p0, "UiUi", "nc", "zksh") -TARGET_BUILTIN(__builtin_riscv_sm3p1, "UiUi", "nc", "zksh") - -// Zihintntl extension -TARGET_BUILTIN(__builtin_riscv_ntl_load, "v.", "t", "zihintntl") -TARGET_BUILTIN(__builtin_riscv_ntl_store, "v.", "t", "zihintntl") - -#undef BUILTIN -#undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td new file mode 100644 index 0000000..4cc89a8 --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsRISCV.td @@ -0,0 +1,148 @@ +//==- BuiltinsRISCV.td - RISC-V Builtin function database ---*- tablegen -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the RISC-V-specific builtin function database. +// +//===----------------------------------------------------------------------===// + +include "clang/Basic/BuiltinsBase.td" + +class RISCVBuiltin : TargetBuiltin { + let Spellings = ["__builtin_riscv_" # NAME]; + let Prototype = prototype; + let Features = features; +} + +let Attributes = [NoThrow, Const] in { +//===----------------------------------------------------------------------===// +// Zbb extension. +//===----------------------------------------------------------------------===// +def orc_b_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb">; +def orc_b_64 : RISCVBuiltin<"uint64_t(uint64_t)", "zbb,64bit">; +def clz_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb|xtheadbb">; +def clz_64 : RISCVBuiltin<"unsigned int(uint64_t)", "zbb|xtheadbb,64bit">; +def ctz_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb">; +def ctz_64 : RISCVBuiltin<"unsigned int(uint64_t)", "zbb,64bit">; + +//===----------------------------------------------------------------------===// +// Zbc or Zbkc extension. +//===----------------------------------------------------------------------===// +def clmul_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc|zbkc">; +def clmul_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc|zbkc,64bit">; +def clmulh_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc|zbkc,32bit">; +def clmulh_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc|zbkc,64bit">; +def clmulr_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc,32bit">; +def clmulr_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc,64bit">; + +//===----------------------------------------------------------------------===// +// Zbkx extension. 
+//===----------------------------------------------------------------------===// +let Features = "zbkx,32bit" in { +def xperm4_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def xperm8_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +} // Features = "zbkx,32bit" + +let Features = "zbkx,64bit" in { +def xperm4_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def xperm8_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +} // Features = "zbkx,64bit" + +//===----------------------------------------------------------------------===// +// Zbkb extension. +//===----------------------------------------------------------------------===// +def brev8_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb">; +def brev8_64 : RISCVBuiltin<"uint64_t(uint64_t)", "zbkb,64bit">; +def zip_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb,32bit">; +def unzip_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb,32bit">; + +//===----------------------------------------------------------------------===// +// Zknd extension. +//===----------------------------------------------------------------------===// +let Features = "zknd,32bit" in { +def aes32dsi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +def aes32dsmi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +} // Features = "zknd,32bit" + +let Features = "zknd,64bit" in { +def aes64ds : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def aes64dsm : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def aes64im : RISCVBuiltin<"uint64_t(uint64_t)">; +} // Features = "zknd,64bit" + +//===----------------------------------------------------------------------===// +// Zknd & Zkne extension. +//===----------------------------------------------------------------------===// +let Features = "zknd|zkne,64bit" in { +def aes64ks1i : RISCVBuiltin<"uint64_t(uint64_t, _Constant unsigned int)">; +def aes64ks2 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +} // Features = "zknd|zkne,64bit" + +//===----------------------------------------------------------------------===// +// Zkne extension. +//===----------------------------------------------------------------------===// +let Features = "zkne,32bit" in { +def aes32esi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +def aes32esmi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +} // Features = "zkne,32bit" + +let Features = "zkne,64bit" in { +def aes64es : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +def aes64esm : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">; +} // Features = "zkne,64bit" + +//===----------------------------------------------------------------------===// +// Zknh extension. 
+//===----------------------------------------------------------------------===// +let Features = "zknh" in { +def sha256sig0 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sha256sig1 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sha256sum0 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sha256sum1 : RISCVBuiltin<"unsigned int(unsigned int)">; +} // Features = "zknh" + +let Features = "zknh,32bit" in { +def sha512sig0h : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sig0l : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sig1h : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sig1l : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sum0r : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +def sha512sum1r : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">; +} // Features = "zknh,32bit" + +let Features = "zknh,64bit" in { +def sha512sig0 : RISCVBuiltin<"uint64_t(uint64_t)">; +def sha512sig1 : RISCVBuiltin<"uint64_t(uint64_t)">; +def sha512sum0 : RISCVBuiltin<"uint64_t(uint64_t)">; +def sha512sum1 : RISCVBuiltin<"uint64_t(uint64_t)">; +} // Features = "zknh,64bit" + +//===----------------------------------------------------------------------===// +// Zksed extension. +//===----------------------------------------------------------------------===// +let Features = "zksed" in { +def sm4ed : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int )">; +def sm4ks : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">; +} // Features = "zksed" + +//===----------------------------------------------------------------------===// +// Zksh extension. +//===----------------------------------------------------------------------===// +let Features = "zksh" in { +def sm3p0 : RISCVBuiltin<"unsigned int(unsigned int)">; +def sm3p1 : RISCVBuiltin<"unsigned int(unsigned int)">; +} // Features = "zksh" + +} // Attributes = [Const, NoThrow] + +//===----------------------------------------------------------------------===// +// Zihintntl extension. 
+//===----------------------------------------------------------------------===// +let Features = "zihintntl", Attributes = [CustomTypeChecking] in { +def ntl_load : RISCVBuiltin<"void(...)">; +def ntl_store : RISCVBuiltin<"void(...)">; +} // Features = "zihintntl", Attributes = [CustomTypeChecking] diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 9689a0f..7785fb4 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -65,6 +65,10 @@ clang_tablegen(BuiltinsBPF.inc -gen-clang-builtins SOURCE BuiltinsBPF.td TARGET ClangBuiltinsBPF) +clang_tablegen(BuiltinsRISCV.inc -gen-clang-builtins + SOURCE BuiltinsRISCV.td + TARGET ClangBuiltinsRISCV) + # ARM NEON and MVE clang_tablegen(arm_neon.inc -gen-arm-neon-sema SOURCE arm_neon.td diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index a4abaae..4333830 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -159,7 +159,7 @@ namespace clang { FirstRVVBuiltin = clang::Builtin::FirstTSBuiltin, LastRVVBuiltin = RISCVVector::FirstTSBuiltin - 1, #define BUILTIN(ID, TYPE, ATTRS) BI##ID, -#include "clang/Basic/BuiltinsRISCV.def" +#include "clang/Basic/BuiltinsRISCV.inc" LastTSBuiltin }; } // namespace RISCV diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index 9285595..acd960c 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -54,7 +54,6 @@ module Clang_Basic { textual header "clang/Basic/BuiltinsNEON.def" textual header "clang/Basic/BuiltinsNVPTX.def" textual header "clang/Basic/BuiltinsPPC.def" - textual header "clang/Basic/BuiltinsRISCV.def" textual header "clang/Basic/BuiltinsRISCVVector.def" textual header "clang/Basic/BuiltinsSME.def" textual header "clang/Basic/BuiltinsSVE.def" diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 837a6e7..a6d4af2 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -234,7 +234,7 @@ static constexpr Builtin::Info BuiltinInfo[] = { {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, #define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, -#include "clang/Basic/BuiltinsRISCV.def" +#include "clang/Basic/BuiltinsRISCV.inc" }; ArrayRef RISCVTargetInfo::getTargetBuiltins() const { -- cgit v1.1 From 1f780bfac919dc34deface0f8e276d3573240291 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 9 Feb 2024 08:19:31 -0500 Subject: [gn] port a8d4a024e6bea3a (BuiltinsRISCV.td) --- llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn | 4 ++++ llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 1 + 2 files changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index 4babd37..d484ff9 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -100,6 +100,10 @@ clang_tablegen("BuiltinsBPF") { args = [ "-gen-clang-builtins" ] } +clang_tablegen("BuiltinsRISCV") { + args = [ "-gen-clang-builtins" ] +} + # ARM CDE, MVE, and NEON. 
clang_tablegen("arm_neon") { diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index 1486d16..bbe9373 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -27,6 +27,7 @@ static_library("Basic") { "//clang/include/clang/Basic:AttrSubMatchRulesList", "//clang/include/clang/Basic:Builtins", "//clang/include/clang/Basic:BuiltinsBPF", + "//clang/include/clang/Basic:BuiltinsRISCV", "//clang/include/clang/Basic:DiagnosticGroups", "//clang/include/clang/Basic:RegularKeywordAttrInfo", "//clang/include/clang/Basic:arm_cde_builtins", -- cgit v1.1 From 413e82a0875222e19993b1038ea803814e5ee48c Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 9 Feb 2024 13:33:09 +0000 Subject: [mlir][ArmSVE] Add intrinsics for the SME2 multi-vector zips (#80985) These are added to the ArmSVE dialect for consistency with LLVM, which registers SME2 intrinsics that don't require ZA under SVE. --- mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td | 25 ++++++++++++++-- mlir/test/Target/LLVMIR/arm-sve.mlir | 42 +++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td index e3f3d9e..f237f23 100644 --- a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td +++ b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td @@ -59,14 +59,15 @@ class ArmSVE_Op traits = []> : class ArmSVE_IntrOp traits = [], list overloadedOperands = [], - list overloadedResults = []> : + list overloadedResults = [], + int numResults = 1> : LLVM_IntrOpBase overloadedResults=*/overloadedResults, /*list overloadedOperands=*/overloadedOperands, /*list traits=*/traits, - /*int numResults=*/1>; + /*int numResults=*/numResults>; class ArmSVE_IntrBinaryOverloadedOp traits = []>: @@ -410,4 +411,24 @@ def ConvertToSvboolIntrOp : /*overloadedResults=*/[]>, Arguments<(ins SVEPredicate:$mask)>; +// Note: This multi-vector intrinsic requires SME2. +def ZipX2IntrOp : ArmSVE_IntrOp<"zip.x2", + /*traits=*/[], + /*overloadedOperands=*/[0], + /*overloadedResults=*/[], + /*numResults=*/2>, + Arguments<(ins Arg:$v1, + Arg:$v2)>; + +// Note: This multi-vector intrinsic requires SME2. 
+def ZipX4IntrOp : ArmSVE_IntrOp<"zip.x4", + /*traits=*/[], + /*overloadedOperands=*/[0], + /*overloadedResults=*/[], + /*numResults=*/4>, + Arguments<(ins Arg:$v1, + Arg:$v2, + Arg:$v3, + Arg:$v4)>; + #endif // ARMSVE_OPS diff --git a/mlir/test/Target/LLVMIR/arm-sve.mlir b/mlir/test/Target/LLVMIR/arm-sve.mlir index b63d3f0..c7cd1b7 100644 --- a/mlir/test/Target/LLVMIR/arm-sve.mlir +++ b/mlir/test/Target/LLVMIR/arm-sve.mlir @@ -314,3 +314,45 @@ llvm.func @arm_sve_convert_to_svbool( : (vector<[1]xi1>) -> vector<[16]xi1> llvm.return } + +// CHECK-LABEL: arm_sve_zip_x2( +// CHECK-SAME: %[[V1:[0-9]+]], +// CHECK-SAME: %[[V2:[0-9]+]], +// CHECK-SAME: %[[V3:[0-9]+]], +// CHECK-SAME: %[[V4:[0-9]+]]) +llvm.func @arm_sve_zip_x2(%nxv16i8: vector<[16]xi8>, %nxv8i16: vector<[8]xi16>, %nxv4i32: vector<[4]xi32>, %nxv2i64: vector<[2]xi64>) { + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %[[V1]], %[[V1]]) + %0 = "arm_sve.intr.zip.x2"(%nxv16i8, %nxv16i8) : (vector<[16]xi8>, vector<[16]xi8>) + -> !llvm.struct<(vector<[16]xi8>, vector<[16]xi8>)> + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( %[[V2]], %[[V2]]) + %1 = "arm_sve.intr.zip.x2"(%nxv8i16, %nxv8i16) : (vector<[8]xi16>, vector<[8]xi16>) + -> !llvm.struct<(vector<[8]xi16>, vector<[8]xi16>)> + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( %[[V3]], %[[V3]]) + %2 = "arm_sve.intr.zip.x2"(%nxv4i32, %nxv4i32) : (vector<[4]xi32>, vector<[4]xi32>) + -> !llvm.struct<(vector<[4]xi32>, vector<[4]xi32>)> + // CHECK: call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( %[[V4]], %[[V4]]) + %3 = "arm_sve.intr.zip.x2"(%nxv2i64, %nxv2i64) : (vector<[2]xi64>, vector<[2]xi64>) + -> !llvm.struct<(vector<[2]xi64>, vector<[2]xi64>)> + llvm.return +} + +// CHECK-LABEL: arm_sve_zip_x4( +// CHECK-SAME: %[[V1:[0-9]+]], +// CHECK-SAME: %[[V2:[0-9]+]], +// CHECK-SAME: %[[V3:[0-9]+]], +// CHECK-SAME: %[[V4:[0-9]+]]) +llvm.func @arm_sve_zip_x4(%nxv16i8: vector<[16]xi8>, %nxv8i16: vector<[8]xi16>, %nxv4i32: vector<[4]xi32>, %nxv2i64: vector<[2]xi64>) { + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( %[[V1]], %[[V1]], %[[V1]], %[[V1]]) + %0 = "arm_sve.intr.zip.x4"(%nxv16i8, %nxv16i8, %nxv16i8, %nxv16i8) : (vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>) + -> !llvm.struct<(vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>)> + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( %[[V2]], %[[V2]], %[[V2]], %[[V2]]) + %1 = "arm_sve.intr.zip.x4"(%nxv8i16, %nxv8i16, %nxv8i16, %nxv8i16) : (vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>) + -> !llvm.struct<(vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>)> + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( %[[V3]], %[[V3]], %[[V3]], %[[V3]]) + %2 = "arm_sve.intr.zip.x4"(%nxv4i32, %nxv4i32, %nxv4i32, %nxv4i32) : (vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>) + -> !llvm.struct<(vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>)> + // CHECK: call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( %[[V4]], %[[V4]], %[[V4]], %[[V4]]) + %3 = "arm_sve.intr.zip.x4"(%nxv2i64, %nxv2i64, %nxv2i64, %nxv2i64) : (vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>) + -> !llvm.struct<(vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>)> + llvm.return +} -- cgit v1.1 From a9e546cc71e72f9febda174ed1ada70c584628c2 Mon Sep 17 00:00:00 2001 From: Tomas Matheson <76168689+tmatheson-arm@users.noreply.github.com> Date: Fri, 9 Feb 2024 13:35:42 +0000 Subject: [TableGen][NFC] 
convert TreePatternNode pointers to references (#81134) Almost all uses of `*TreePatternNode` expect it to be non-null. There was the occasional check that it wasn't, which I have removed. Making them references makes it clear that they exist. This was attempted in 2018 (1b465767d6ca69f4b7201503f5f21e6125fe049a) for `TreePatternNode::getChild()` but that was reverted. --- llvm/utils/TableGen/CodeGenDAGPatterns.cpp | 364 +++++++++++----------- llvm/utils/TableGen/CodeGenDAGPatterns.h | 20 +- llvm/utils/TableGen/DAGISelEmitter.cpp | 38 +-- llvm/utils/TableGen/DAGISelMatcher.cpp | 4 +- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 12 +- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 281 +++++++++-------- llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 4 +- llvm/utils/TableGen/FastISelEmitter.cpp | 98 +++--- llvm/utils/TableGen/GlobalISelEmitter.cpp | 422 +++++++++++++------------- 9 files changed, 615 insertions(+), 628 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index 62e0482..a9046e0 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -1406,10 +1406,10 @@ std::string TreePredicateFn::getCodeToRunOnSDNode() const { // PatternToMatch implementation // -static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) { - if (!P->isLeaf()) +static bool isImmAllOnesAllZerosMatch(const TreePatternNode &P) { + if (!P.isLeaf()) return false; - DefInit *DI = dyn_cast(P->getLeafValue()); + DefInit *DI = dyn_cast(P.getLeafValue()); if (!DI) return false; @@ -1420,15 +1420,15 @@ static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) { /// getPatternSize - Return the 'size' of this pattern. We want to match large /// patterns before small ones. This is used to determine the size of a /// pattern. -static unsigned getPatternSize(const TreePatternNode *P, +static unsigned getPatternSize(const TreePatternNode &P, const CodeGenDAGPatterns &CGP) { unsigned Size = 3; // The node itself. // If the root node is a ConstantSDNode, increases its size. // e.g. (set R32:$dst, 0). - if (P->isLeaf() && isa(P->getLeafValue())) + if (P.isLeaf() && isa(P.getLeafValue())) Size += 2; - if (const ComplexPattern *AM = P->getComplexPatternInfo(CGP)) { + if (const ComplexPattern *AM = P.getComplexPatternInfo(CGP)) { Size += AM->getComplexity(); // We don't want to count any children twice, so return early. return Size; @@ -1436,14 +1436,14 @@ static unsigned getPatternSize(const TreePatternNode *P, // If this node has some predicate function that must match, it adds to the // complexity of this node. - if (!P->getPredicateCalls().empty()) + if (!P.getPredicateCalls().empty()) ++Size; // Count children in the count if they are also nodes. - for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) { - const TreePatternNode *Child = P->getChild(i); - if (!Child->isLeaf() && Child->getNumTypes()) { - const TypeSetByHwMode &T0 = Child->getExtType(0); + for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i) { + const TreePatternNode &Child = P.getChild(i); + if (!Child.isLeaf() && Child.getNumTypes()) { + const TypeSetByHwMode &T0 = Child.getExtType(0); // At this point, all variable type sets should be simple, i.e. only // have a default mode. 
if (T0.getMachineValueType() != MVT::Other) { @@ -1451,14 +1451,14 @@ static unsigned getPatternSize(const TreePatternNode *P, continue; } } - if (Child->isLeaf()) { - if (isa(Child->getLeafValue())) + if (Child.isLeaf()) { + if (isa(Child.getLeafValue())) Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). - else if (Child->getComplexPatternInfo(CGP)) + else if (Child.getComplexPatternInfo(CGP)) Size += getPatternSize(Child, CGP); else if (isImmAllOnesAllZerosMatch(Child)) Size += 4; // Matches a build_vector(+3) and a predicate (+1). - else if (!Child->getPredicateCalls().empty()) + else if (!Child.getPredicateCalls().empty()) ++Size; } } @@ -1582,7 +1582,7 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) { /// getOperandNum - Return the node corresponding to operand #OpNo in tree /// N, and the result number in ResNo. -static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N, +static TreePatternNode &getOperandNum(unsigned OpNo, TreePatternNode &N, const SDNodeInfo &NodeInfo, unsigned &ResNo) { unsigned NumResults = NodeInfo.getNumResults(); @@ -1593,120 +1593,120 @@ static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N, OpNo -= NumResults; - if (OpNo >= N->getNumChildren()) { + if (OpNo >= N.getNumChildren()) { std::string S; raw_string_ostream OS(S); OS << "Invalid operand number in type constraint " << (OpNo + NumResults) << " "; - N->print(OS); + N.print(OS); PrintFatalError(S); } - return N->getChild(OpNo); + return N.getChild(OpNo); } /// ApplyTypeConstraint - Given a node in a pattern, apply this type /// constraint to the nodes operands. This returns true if it makes a /// change, false otherwise. If a type contradiction is found, flag an error. -bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, +bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode &N, const SDNodeInfo &NodeInfo, TreePattern &TP) const { if (TP.hasError()) return false; unsigned ResNo = 0; // The result number being referenced. - TreePatternNode *NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo); + TreePatternNode &NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo); TypeInfer &TI = TP.getInfer(); switch (ConstraintType) { case SDTCisVT: // Operand must be a particular type. - return NodeToApply->UpdateNodeType(ResNo, VVT, TP); + return NodeToApply.UpdateNodeType(ResNo, VVT, TP); case SDTCisPtrTy: // Operand must be same as target pointer type. - return NodeToApply->UpdateNodeType(ResNo, MVT::iPTR, TP); + return NodeToApply.UpdateNodeType(ResNo, MVT::iPTR, TP); case SDTCisInt: // Require it to be one of the legal integer VTs. - return TI.EnforceInteger(NodeToApply->getExtType(ResNo)); + return TI.EnforceInteger(NodeToApply.getExtType(ResNo)); case SDTCisFP: // Require it to be one of the legal fp VTs. - return TI.EnforceFloatingPoint(NodeToApply->getExtType(ResNo)); + return TI.EnforceFloatingPoint(NodeToApply.getExtType(ResNo)); case SDTCisVec: // Require it to be one of the legal vector VTs. 
- return TI.EnforceVector(NodeToApply->getExtType(ResNo)); + return TI.EnforceVector(NodeToApply.getExtType(ResNo)); case SDTCisSameAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = + TreePatternNode &OtherNode = getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo); - return (int)NodeToApply->UpdateNodeType(ResNo, - OtherNode->getExtType(OResNo), TP) | - (int)OtherNode->UpdateNodeType(OResNo, - NodeToApply->getExtType(ResNo), TP); + return (int)NodeToApply.UpdateNodeType(ResNo, OtherNode.getExtType(OResNo), + TP) | + (int)OtherNode.UpdateNodeType(OResNo, NodeToApply.getExtType(ResNo), + TP); } case SDTCisVTSmallerThanOp: { // The NodeToApply must be a leaf node that is a VT. OtherOperandNum must // have an integer type that is smaller than the VT. - if (!NodeToApply->isLeaf() || !isa(NodeToApply->getLeafValue()) || - !cast(NodeToApply->getLeafValue()) + if (!NodeToApply.isLeaf() || !isa(NodeToApply.getLeafValue()) || + !cast(NodeToApply.getLeafValue()) ->getDef() ->isSubClassOf("ValueType")) { - TP.error(N->getOperator()->getName() + " expects a VT operand!"); + TP.error(N.getOperator()->getName() + " expects a VT operand!"); return false; } - DefInit *DI = cast(NodeToApply->getLeafValue()); + DefInit *DI = cast(NodeToApply.getLeafValue()); const CodeGenTarget &T = TP.getDAGPatterns().getTargetInfo(); auto VVT = getValueTypeByHwMode(DI->getDef(), T.getHwModes()); TypeSetByHwMode TypeListTmp(VVT); unsigned OResNo = 0; - TreePatternNode *OtherNode = getOperandNum( + TreePatternNode &OtherNode = getOperandNum( x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, OResNo); - return TI.EnforceSmallerThan(TypeListTmp, OtherNode->getExtType(OResNo), + return TI.EnforceSmallerThan(TypeListTmp, OtherNode.getExtType(OResNo), /*SmallIsVT*/ true); } case SDTCisOpSmallerThanOp: { unsigned BResNo = 0; - TreePatternNode *BigOperand = getOperandNum( + TreePatternNode &BigOperand = getOperandNum( x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo, BResNo); - return TI.EnforceSmallerThan(NodeToApply->getExtType(ResNo), - BigOperand->getExtType(BResNo)); + return TI.EnforceSmallerThan(NodeToApply.getExtType(ResNo), + BigOperand.getExtType(BResNo)); } case SDTCisEltOfVec: { unsigned VResNo = 0; - TreePatternNode *VecOperand = getOperandNum( + TreePatternNode &VecOperand = getOperandNum( x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of VecOperand that don't have the right element // type. - return TI.EnforceVectorEltTypeIs(VecOperand->getExtType(VResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceVectorEltTypeIs(VecOperand.getExtType(VResNo), + NodeToApply.getExtType(ResNo)); } case SDTCisSubVecOfVec: { unsigned VResNo = 0; - TreePatternNode *BigVecOperand = getOperandNum( + TreePatternNode &BigVecOperand = getOperandNum( x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); // Filter vector types out of BigVecOperand that don't have the // right subvector type. 
- return TI.EnforceVectorSubVectorTypeIs(BigVecOperand->getExtType(VResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceVectorSubVectorTypeIs(BigVecOperand.getExtType(VResNo), + NodeToApply.getExtType(ResNo)); } case SDTCVecEltisVT: { - return TI.EnforceVectorEltTypeIs(NodeToApply->getExtType(ResNo), VVT); + return TI.EnforceVectorEltTypeIs(NodeToApply.getExtType(ResNo), VVT); } case SDTCisSameNumEltsAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = getOperandNum( + TreePatternNode &OtherNode = getOperandNum( x.SDTCisSameNumEltsAs_Info.OtherOperandNum, N, NodeInfo, OResNo); - return TI.EnforceSameNumElts(OtherNode->getExtType(OResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceSameNumElts(OtherNode.getExtType(OResNo), + NodeToApply.getExtType(ResNo)); } case SDTCisSameSizeAs: { unsigned OResNo = 0; - TreePatternNode *OtherNode = getOperandNum( + TreePatternNode &OtherNode = getOperandNum( x.SDTCisSameSizeAs_Info.OtherOperandNum, N, NodeInfo, OResNo); - return TI.EnforceSameSize(OtherNode->getExtType(OResNo), - NodeToApply->getExtType(ResNo)); + return TI.EnforceSameSize(OtherNode.getExtType(OResNo), + NodeToApply.getExtType(ResNo)); } } llvm_unreachable("Invalid ConstraintType!"); @@ -1751,7 +1751,7 @@ bool TreePatternNode::ContainsUnresolvedType(TreePattern &TP) const { if (!TP.getInfer().isConcrete(Types[i], true)) return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (getChild(i)->ContainsUnresolvedType(TP)) + if (getChild(i).ContainsUnresolvedType(TP)) return true; return false; } @@ -1929,7 +1929,7 @@ void TreePatternNode::print(raw_ostream &OS) const { ListSeparator LS; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) { OS << LS; - getChild(i)->print(OS); + getChild(i).print(OS); } } OS << ")"; @@ -1958,37 +1958,37 @@ void TreePatternNode::dump() const { print(errs()); } /// the assigned name is present in the dependent variable set, then /// the assigned name is considered significant and the node is /// isomorphic if the names match. -bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N, +bool TreePatternNode::isIsomorphicTo(const TreePatternNode &N, const MultipleUseVarSet &DepVars) const { - if (N == this) + if (&N == this) return true; - if (N->isLeaf() != isLeaf()) + if (N.isLeaf() != isLeaf()) return false; // Check operator of non-leaves early since it can be cheaper than checking // types. 
if (!isLeaf()) - if (N->getOperator() != getOperator() || - N->getNumChildren() != getNumChildren()) + if (N.getOperator() != getOperator() || + N.getNumChildren() != getNumChildren()) return false; - if (getExtTypes() != N->getExtTypes() || - getPredicateCalls() != N->getPredicateCalls() || - getTransformFn() != N->getTransformFn()) + if (getExtTypes() != N.getExtTypes() || + getPredicateCalls() != N.getPredicateCalls() || + getTransformFn() != N.getTransformFn()) return false; if (isLeaf()) { if (DefInit *DI = dyn_cast(getLeafValue())) { - if (DefInit *NDI = dyn_cast(N->getLeafValue())) { + if (DefInit *NDI = dyn_cast(N.getLeafValue())) { return ((DI->getDef() == NDI->getDef()) && - (!DepVars.contains(getName()) || getName() == N->getName())); + (!DepVars.contains(getName()) || getName() == N.getName())); } } - return getLeafValue() == N->getLeafValue(); + return getLeafValue() == N.getLeafValue(); } for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (!getChild(i)->isIsomorphicTo(N->getChild(i), DepVars)) + if (!getChild(i).isIsomorphicTo(N.getChild(i), DepVars)) return false; return true; } @@ -2003,7 +2003,7 @@ TreePatternNodePtr TreePatternNode::clone() const { std::vector CChildren; CChildren.reserve(Children.size()); for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - CChildren.push_back(getChild(i)->clone()); + CChildren.push_back(getChild(i).clone()); New = makeIntrusiveRefCnt( getOperator(), std::move(CChildren), getNumTypes()); } @@ -2023,7 +2023,7 @@ void TreePatternNode::RemoveAllTypes() { if (isLeaf()) return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - getChild(i)->RemoveAllTypes(); + getChild(i).RemoveAllTypes(); } /// SubstituteFormalArguments - Replace the formal arguments in this tree @@ -2034,24 +2034,24 @@ void TreePatternNode::SubstituteFormalArguments( return; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) { - TreePatternNode *Child = getChild(i); - if (Child->isLeaf()) { - Init *Val = Child->getLeafValue(); + TreePatternNode &Child = getChild(i); + if (Child.isLeaf()) { + Init *Val = Child.getLeafValue(); // Note that, when substituting into an output pattern, Val might be an // UnsetInit. if (isa(Val) || (isa(Val) && cast(Val)->getDef()->getName() == "node")) { // We found a use of a formal argument, replace it with its value. 
- TreePatternNodePtr NewChild = ArgMap[Child->getName()]; + TreePatternNodePtr NewChild = ArgMap[Child.getName()]; assert(NewChild && "Couldn't find formal argument!"); - assert((Child->getPredicateCalls().empty() || - NewChild->getPredicateCalls() == Child->getPredicateCalls()) && + assert((Child.getPredicateCalls().empty() || + NewChild->getPredicateCalls() == Child.getPredicateCalls()) && "Non-empty child predicate clobbered!"); setChild(i, std::move(NewChild)); } } else { - getChild(i)->SubstituteFormalArguments(ArgMap); + getChild(i).SubstituteFormalArguments(ArgMap); } } } @@ -2325,7 +2325,7 @@ TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const { getOperator() != CDP.get_intrinsic_wo_chain_sdnode()) return nullptr; - unsigned IID = cast(getChild(0)->getLeafValue())->getValue(); + unsigned IID = cast(getChild(0).getLeafValue())->getValue(); return &CDP.getIntrinsicInfo(IID); } @@ -2397,7 +2397,7 @@ bool TreePatternNode::TreeHasProperty(SDNP Property, if (NodeHasProperty(Property, CGP)) return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (getChild(i)->TreeHasProperty(Property, CGP)) + if (getChild(i).TreeHasProperty(Property, CGP)) return true; return false; } @@ -2411,11 +2411,11 @@ bool TreePatternNode::isCommutativeIntrinsic( return false; } -static bool isOperandClass(const TreePatternNode *N, StringRef Class) { - if (!N->isLeaf()) - return N->getOperator()->isSubClassOf(Class); +static bool isOperandClass(const TreePatternNode &N, StringRef Class) { + if (!N.isLeaf()) + return N.getOperator()->isSubClassOf(Class); - DefInit *DI = dyn_cast(N->getLeafValue()); + DefInit *DI = dyn_cast(N.getLeafValue()); if (DI && DI->getDef()->isSubClassOf(Class)) return true; @@ -2506,15 +2506,15 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } // Apply type info to the intrinsic ID. - MadeChange |= getChild(0)->UpdateNodeType(0, MVT::iPTR, TP); + MadeChange |= getChild(0).UpdateNodeType(0, MVT::iPTR, TP); for (unsigned i = 0, e = getNumChildren() - 1; i != e; ++i) { - MadeChange |= getChild(i + 1)->ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= getChild(i + 1).ApplyTypeConstraints(TP, NotRegisters); MVT::SimpleValueType OpVT = getValueType(Int->IS.ParamTys[i]->getValueAsDef("VT")); - assert(getChild(i + 1)->getNumTypes() == 1 && "Unhandled case"); - MadeChange |= getChild(i + 1)->UpdateNodeType(0, OpVT, TP); + assert(getChild(i + 1).getNumTypes() == 1 && "Unhandled case"); + MadeChange |= getChild(i + 1).UpdateNodeType(0, OpVT, TP); } return MadeChange; } @@ -2532,8 +2532,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { bool MadeChange = false; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters); - MadeChange |= NI.ApplyTypeConstraints(this, TP); + MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= NI.ApplyTypeConstraints(*this, TP); return MadeChange; } @@ -2568,9 +2568,9 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // If this is an INSERT_SUBREG, constrain the source and destination VTs to // be the same. 
if (getOperator()->getName() == "INSERT_SUBREG") { - assert(getChild(0)->getNumTypes() == 1 && "FIXME: Unhandled"); - MadeChange |= UpdateNodeType(0, getChild(0)->getExtType(0), TP); - MadeChange |= getChild(0)->UpdateNodeType(0, getExtType(0), TP); + assert(getChild(0).getNumTypes() == 1 && "FIXME: Unhandled"); + MadeChange |= UpdateNodeType(0, getChild(0).getExtType(0), TP); + MadeChange |= getChild(0).UpdateNodeType(0, getExtType(0), TP); } else if (getOperator()->getName() == "REG_SEQUENCE") { // We need to do extra, custom typechecking for REG_SEQUENCE since it is // variadic. @@ -2592,7 +2592,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } for (unsigned I = 1; I < NChild; I += 2) { - TreePatternNode *SubIdxChild = getChild(I + 1); + TreePatternNode &SubIdxChild = getChild(I + 1); if (!isOperandClass(SubIdxChild, "SubRegIndex")) { TP.error("REG_SEQUENCE requires a SubRegIndex for operand " + Twine(I + 1) + "!"); @@ -2637,7 +2637,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { return false; } - TreePatternNode *Child = getChild(ChildNo++); + TreePatternNode *Child = &getChild(ChildNo++); unsigned ChildResNo = 0; // Instructions always use res #0 of their op. // If the operand has sub-operands, they may be provided by distinct @@ -2660,7 +2660,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { getNumChildren()); return false; } - Child = getChild(ChildNo++); + Child = &getChild(ChildNo++); SubRec = cast(MIOpInfo->getArg(Arg))->getDef(); MadeChange |= @@ -2683,7 +2683,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters); return MadeChange; } @@ -2707,7 +2707,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { } for (unsigned i = 0; i < getNumChildren(); ++i) - MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters); + MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters); return MadeChange; } @@ -2721,16 +2721,16 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { return false; } - bool MadeChange = getChild(0)->ApplyTypeConstraints(TP, NotRegisters); + bool MadeChange = getChild(0).ApplyTypeConstraints(TP, NotRegisters); return MadeChange; } /// OnlyOnRHSOfCommutative - Return true if this value is only allowed on the /// RHS of a commutative operation, not the on LHS. -static bool OnlyOnRHSOfCommutative(TreePatternNode *N) { - if (!N->isLeaf() && N->getOperator()->getName() == "imm") +static bool OnlyOnRHSOfCommutative(TreePatternNode &N) { + if (!N.isLeaf() && N.getOperator()->getName() == "imm") return true; - if (N->isLeaf() && isa(N->getLeafValue())) + if (N.isLeaf() && isa(N.getLeafValue())) return true; if (isImmAllOnesAllZerosMatch(N)) return true; @@ -2748,7 +2748,7 @@ bool TreePatternNode::canPatternMatch(std::string &Reason, return true; for (unsigned i = 0, e = getNumChildren(); i != e; ++i) - if (!getChild(i)->canPatternMatch(Reason, CDP)) + if (!getChild(i).canPatternMatch(Reason, CDP)) return false; // If this is an intrinsic, handle cases that would make it not match. 
For @@ -2818,15 +2818,15 @@ void TreePattern::error(const Twine &Msg) { void TreePattern::ComputeNamedNodes() { for (TreePatternNodePtr &Tree : Trees) - ComputeNamedNodes(Tree.get()); + ComputeNamedNodes(*Tree); } -void TreePattern::ComputeNamedNodes(TreePatternNode *N) { - if (!N->getName().empty()) - NamedNodes[N->getName()].push_back(N); +void TreePattern::ComputeNamedNodes(TreePatternNode &N) { + if (!N.getName().empty()) + NamedNodes[N.getName()].push_back(&N); - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - ComputeNamedNodes(N->getChild(i)); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + ComputeNamedNodes(N.getChild(i)); } TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, @@ -3031,7 +3031,7 @@ static bool SimplifyTree(TreePatternNodePtr &N) { if (N->getOperator()->getName() == "bitconvert" && N->getExtType(0).isValueTypeByHwMode(false) && !N->getExtType(0).empty() && - N->getExtType(0) == N->getChild(0)->getExtType(0) && + N->getExtType(0) == N->getChild(0).getExtType(0) && N->getName().empty()) { N = N->getChildShared(0); SimplifyTree(N); @@ -3451,11 +3451,11 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( if (Pat->getOperator()->getName() == "implicit") { for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) { - TreePatternNode *Dest = Pat->getChild(i); - if (!Dest->isLeaf()) + TreePatternNode &Dest = Pat->getChild(i); + if (!Dest.isLeaf()) I.error("implicitly defined value should be a register!"); - DefInit *Val = dyn_cast(Dest->getLeafValue()); + DefInit *Val = dyn_cast(Dest.getLeafValue()); if (!Val || !Val->getDef()->isSubClassOf("Register")) I.error("implicitly defined value should be a register!"); if (Val) @@ -3468,7 +3468,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( // If this is not a set, verify that the children nodes are not void typed, // and recurse. for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) { - if (Pat->getChild(i)->getNumTypes() == 0) + if (Pat->getChild(i).getNumTypes() == 0) I.error("Cannot have void nodes inside of patterns!"); FindPatternInputsAndOutputs(I, Pat->getChildShared(i), InstInputs, InstResults, InstImpResults); @@ -3550,35 +3550,35 @@ public: isBitcast(false), isVariadic(false), hasChain(false) {} void Analyze(const PatternToMatch &Pat) { - const TreePatternNode *N = Pat.getSrcPattern(); + const TreePatternNode &N = Pat.getSrcPattern(); AnalyzeNode(N); // These properties are detected only on the root node. 
isBitcast = IsNodeBitcast(N); } private: - bool IsNodeBitcast(const TreePatternNode *N) const { + bool IsNodeBitcast(const TreePatternNode &N) const { if (hasSideEffects || mayLoad || mayStore || isVariadic) return false; - if (N->isLeaf()) + if (N.isLeaf()) return false; - if (N->getNumChildren() != 1 || !N->getChild(0)->isLeaf()) + if (N.getNumChildren() != 1 || !N.getChild(0).isLeaf()) return false; - if (N->getOperator()->isSubClassOf("ComplexPattern")) + if (N.getOperator()->isSubClassOf("ComplexPattern")) return false; - const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N->getOperator()); + const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N.getOperator()); if (OpInfo.getNumResults() != 1 || OpInfo.getNumOperands() != 1) return false; return OpInfo.getEnumName() == "ISD::BITCAST"; } public: - void AnalyzeNode(const TreePatternNode *N) { - if (N->isLeaf()) { - if (DefInit *DI = dyn_cast(N->getLeafValue())) { + void AnalyzeNode(const TreePatternNode &N) { + if (N.isLeaf()) { + if (DefInit *DI = dyn_cast(N.getLeafValue())) { Record *LeafRec = DI->getDef(); // Handle ComplexPattern leaves. if (LeafRec->isSubClassOf("ComplexPattern")) { @@ -3595,22 +3595,22 @@ public: } // Analyze children. - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - AnalyzeNode(N->getChild(i)); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + AnalyzeNode(N.getChild(i)); // Notice properties of the node. - if (N->NodeHasProperty(SDNPMayStore, CDP)) + if (N.NodeHasProperty(SDNPMayStore, CDP)) mayStore = true; - if (N->NodeHasProperty(SDNPMayLoad, CDP)) + if (N.NodeHasProperty(SDNPMayLoad, CDP)) mayLoad = true; - if (N->NodeHasProperty(SDNPSideEffect, CDP)) + if (N.NodeHasProperty(SDNPSideEffect, CDP)) hasSideEffects = true; - if (N->NodeHasProperty(SDNPVariadic, CDP)) + if (N.NodeHasProperty(SDNPVariadic, CDP)) isVariadic = true; - if (N->NodeHasProperty(SDNPHasChain, CDP)) + if (N.NodeHasProperty(SDNPHasChain, CDP)) hasChain = true; - if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) { + if (const CodeGenIntrinsic *IntInfo = N.getIntrinsicInfo(CDP)) { ModRefInfo MR = IntInfo->ME.getModRef(); // If this is an intrinsic, analyze it. if (isRefSet(MR)) @@ -3723,14 +3723,14 @@ static bool hasNullFragReference(ListInit *LI) { } /// Get all the instructions in a tree. -static void getInstructionsInTree(TreePatternNode *Tree, +static void getInstructionsInTree(TreePatternNode &Tree, SmallVectorImpl &Instrs) { - if (Tree->isLeaf()) + if (Tree.isLeaf()) return; - if (Tree->getOperator()->isSubClassOf("Instruction")) - Instrs.push_back(Tree->getOperator()); - for (unsigned i = 0, e = Tree->getNumChildren(); i != e; ++i) - getInstructionsInTree(Tree->getChild(i), Instrs); + if (Tree.getOperator()->isSubClassOf("Instruction")) + Instrs.push_back(Tree.getOperator()); + for (unsigned i = 0, e = Tree.getNumChildren(); i != e; ++i) + getInstructionsInTree(Tree.getChild(i), Instrs); } /// Check the class of a pattern leaf node against the instruction operand it @@ -3917,7 +3917,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI, TreePatternNodePtr Pattern = I.getTree(0); TreePatternNodePtr SrcPattern; if (Pattern->getOperator()->getName() == "set") { - SrcPattern = Pattern->getChild(Pattern->getNumChildren() - 1)->clone(); + SrcPattern = Pattern->getChild(Pattern->getNumChildren() - 1).clone(); } else { // Not a set (store or something?) 
SrcPattern = Pattern; @@ -3995,22 +3995,22 @@ void CodeGenDAGPatterns::ParseInstructions() { typedef std::pair NameRecord; -static void FindNames(TreePatternNode *P, +static void FindNames(TreePatternNode &P, std::map &Names, TreePattern *PatternTop) { - if (!P->getName().empty()) { - NameRecord &Rec = Names[P->getName()]; + if (!P.getName().empty()) { + NameRecord &Rec = Names[P.getName()]; // If this is the first instance of the name, remember the node. if (Rec.second++ == 0) - Rec.first = P; - else if (Rec.first->getExtTypes() != P->getExtTypes()) - PatternTop->error("repetition of value: $" + P->getName() + + Rec.first = &P; + else if (Rec.first->getExtTypes() != P.getExtTypes()) + PatternTop->error("repetition of value: $" + P.getName() + " where different uses have different types!"); } - if (!P->isLeaf()) { - for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) - FindNames(P->getChild(i), Names, PatternTop); + if (!P.isLeaf()) { + for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i) + FindNames(P.getChild(i), Names, PatternTop); } } @@ -4018,7 +4018,7 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, PatternToMatch &&PTM) { // Do some sanity checking on the pattern we're about to match. std::string Reason; - if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this)) { + if (!PTM.getSrcPattern().canPatternMatch(Reason, *this)) { PrintWarning(Pattern->getRecord()->getLoc(), Twine("Pattern can never match: ") + Reason); return; @@ -4027,7 +4027,7 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern, // If the source pattern's root is a complex pattern, that complex pattern // must specify the nodes it can potentially match. if (const ComplexPattern *CP = - PTM.getSrcPattern()->getComplexPatternInfo(*this)) + PTM.getSrcPattern().getComplexPatternInfo(*this)) if (CP->getRootNodes().empty()) Pattern->error("ComplexPattern at root must specify list of opcodes it" " could match"); @@ -4189,27 +4189,27 @@ void CodeGenDAGPatterns::VerifyInstructionFlags() { /// Given a pattern result with an unresolved type, see if we can find one /// instruction with an unresolved result type. Force this result type to an /// arbitrary element if it's possible types to converge results. -static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) { - if (N->isLeaf()) +static bool ForceArbitraryInstResultType(TreePatternNode &N, TreePattern &TP) { + if (N.isLeaf()) return false; // Analyze children. - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - if (ForceArbitraryInstResultType(N->getChild(i), TP)) + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + if (ForceArbitraryInstResultType(N.getChild(i), TP)) return true; - if (!N->getOperator()->isSubClassOf("Instruction")) + if (!N.getOperator()->isSubClassOf("Instruction")) return false; // If this type is already concrete or completely unknown we can't do // anything. TypeInfer &TI = TP.getInfer(); - for (unsigned i = 0, e = N->getNumTypes(); i != e; ++i) { - if (N->getExtType(i).empty() || TI.isConcrete(N->getExtType(i), false)) + for (unsigned i = 0, e = N.getNumTypes(); i != e; ++i) { + if (N.getExtType(i).empty() || TI.isConcrete(N.getExtType(i), false)) continue; // Otherwise, force its type to an arbitrary choice. - if (TI.forceArbitrary(N->getExtType(i))) + if (TI.forceArbitrary(N.getExtType(i))) return true; } @@ -4285,7 +4285,7 @@ void CodeGenDAGPatterns::ParseOnePattern( // arbitrary types to the result pattern's nodes. 
if (!IterateInference && InferredAllPatternTypes && !InferredAllResultTypes) IterateInference = - ForceArbitraryInstResultType(Result.getTree(0).get(), Result); + ForceArbitraryInstResultType(*Result.getTree(0), Result); } while (IterateInference); // Verify that we inferred enough types that we can do something with the @@ -4372,13 +4372,13 @@ void CodeGenDAGPatterns::ParsePatterns() { } } -static void collectModes(std::set &Modes, const TreePatternNode *N) { - for (const TypeSetByHwMode &VTS : N->getExtTypes()) +static void collectModes(std::set &Modes, const TreePatternNode &N) { + for (const TypeSetByHwMode &VTS : N.getExtTypes()) for (const auto &I : VTS) Modes.insert(I.first); - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - collectModes(Modes, N->getChild(i)); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + collectModes(Modes, N.getChild(i)); } void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { @@ -4391,8 +4391,8 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { auto AppendPattern = [this](PatternToMatch &P, unsigned Mode, StringRef Check) { - TreePatternNodePtr NewSrc = P.getSrcPattern()->clone(); - TreePatternNodePtr NewDst = P.getDstPattern()->clone(); + TreePatternNodePtr NewSrc = P.getSrcPattern().clone(); + TreePatternNodePtr NewDst = P.getDstPattern().clone(); if (!NewSrc->setDefaultMode(Mode) || !NewDst->setDefaultMode(Mode)) { return; } @@ -4405,10 +4405,10 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { for (PatternToMatch &P : Copy) { const TreePatternNode *SrcP = nullptr, *DstP = nullptr; - if (P.getSrcPattern()->hasProperTypeByHwMode()) - SrcP = P.getSrcPattern(); - if (P.getDstPattern()->hasProperTypeByHwMode()) - DstP = P.getDstPattern(); + if (P.getSrcPattern().hasProperTypeByHwMode()) + SrcP = &P.getSrcPattern(); + if (P.getDstPattern().hasProperTypeByHwMode()) + DstP = &P.getDstPattern(); if (!SrcP && !DstP) { PatternsToMatch.push_back(P); continue; @@ -4416,9 +4416,9 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { std::set Modes; if (SrcP) - collectModes(Modes, SrcP); + collectModes(Modes, *SrcP); if (DstP) - collectModes(Modes, DstP); + collectModes(Modes, *DstP); // The predicate for the default mode needs to be constructed for each // pattern separately. @@ -4458,18 +4458,18 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() { /// Dependent variable map for CodeGenDAGPattern variant generation typedef StringMap DepVarMap; -static void FindDepVarsOf(TreePatternNode *N, DepVarMap &DepMap) { - if (N->isLeaf()) { - if (N->hasName() && isa(N->getLeafValue())) - DepMap[N->getName()]++; +static void FindDepVarsOf(TreePatternNode &N, DepVarMap &DepMap) { + if (N.isLeaf()) { + if (N.hasName() && isa(N.getLeafValue())) + DepMap[N.getName()]++; } else { - for (size_t i = 0, e = N->getNumChildren(); i != e; ++i) - FindDepVarsOf(N->getChild(i), DepMap); + for (size_t i = 0, e = N.getNumChildren(); i != e; ++i) + FindDepVarsOf(N.getChild(i), DepMap); } } /// Find dependent variables within child patterns -static void FindDepVars(TreePatternNode *N, MultipleUseVarSet &DepVars) { +static void FindDepVars(TreePatternNode &N, MultipleUseVarSet &DepVars) { DepVarMap depcounts; FindDepVarsOf(N, depcounts); for (const auto &Pair : depcounts) { @@ -4543,7 +4543,7 @@ static void CombineChildVariants( // which are the same pattern. Ignore the dups. 
if (R->canPatternMatch(ErrString, CDP) && none_of(OutVariants, [&](TreePatternNodePtr Variant) { - return R->isIsomorphicTo(Variant.get(), DepVars); + return R->isIsomorphicTo(*Variant, DepVars); })) OutVariants.push_back(R); @@ -4589,12 +4589,12 @@ GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N, return; } - if (N->getChild(0)->isLeaf() || N->getChild(0)->getOperator() != Operator) + if (N->getChild(0).isLeaf() || N->getChild(0).getOperator() != Operator) Children.push_back(N->getChildShared(0)); else GatherChildrenOfAssociativeOpcode(N->getChildShared(0), Children); - if (N->getChild(1)->isLeaf() || N->getChild(1)->getOperator() != Operator) + if (N->getChild(1).isLeaf() || N->getChild(1).getOperator() != Operator) Children.push_back(N->getChildShared(1)); else GatherChildrenOfAssociativeOpcode(N->getChildShared(1), Children); @@ -4688,9 +4688,9 @@ static void GenerateVariantsOf(TreePatternNodePtr N, unsigned i = 0 + Skip; unsigned e = 2 + Skip; for (; i != e; ++i) { - TreePatternNode *Child = N->getChild(i); - if (Child->isLeaf()) - if (DefInit *DI = dyn_cast(Child->getLeafValue())) { + TreePatternNode &Child = N->getChild(i); + if (Child.isLeaf()) + if (DefInit *DI = dyn_cast(Child.getLeafValue())) { Record *RR = DI->getDef(); if (RR->isSubClassOf("Register")) NoRegisters = false; @@ -4738,7 +4738,7 @@ void CodeGenDAGPatterns::GenerateVariants() { continue; LLVM_DEBUG(errs() << "FOUND VARIANTS OF: "; - PatternsToMatch[i].getSrcPattern()->dump(); errs() << "\n"); + PatternsToMatch[i].getSrcPattern().dump(); errs() << "\n"); for (unsigned v = 0, e = Variants.size(); v != e; ++v) { TreePatternNodePtr Variant = Variants[v]; diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h index ea6219c..823c40c 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h @@ -406,7 +406,7 @@ struct SDTypeConstraint { /// constraint to the nodes operands. This returns true if it makes a /// change, false otherwise. If a type contradiction is found, an error /// is flagged. - bool ApplyTypeConstraint(TreePatternNode *N, const SDNodeInfo &NodeInfo, + bool ApplyTypeConstraint(TreePatternNode &N, const SDNodeInfo &NodeInfo, TreePattern &TP) const; }; @@ -474,7 +474,7 @@ public: /// constraints for this node to the operands of the node. This returns /// true if it makes a change, false otherwise. If a type contradiction is /// found, an error is flagged. - bool ApplyTypeConstraints(TreePatternNode *N, TreePattern &TP) const; + bool ApplyTypeConstraints(TreePatternNode &N, TreePattern &TP) const; }; /// TreePredicateFn - This is an abstraction that represents the predicates on @@ -722,10 +722,10 @@ public: } unsigned getNumChildren() const { return Children.size(); } - const TreePatternNode *getChild(unsigned N) const { - return Children[N].get(); + const TreePatternNode &getChild(unsigned N) const { + return *Children[N].get(); } - TreePatternNode *getChild(unsigned N) { return Children[N].get(); } + TreePatternNode &getChild(unsigned N) { return *Children[N].get(); } const TreePatternNodePtr &getChildShared(unsigned N) const { return Children[N]; } @@ -812,7 +812,7 @@ public: // Higher level manipulation routines. /// the specified node. For this comparison, all of the state of the node /// is considered, except for the assigned name. Nodes with differing names /// that are otherwise identical are considered isomorphic. 
- bool isIsomorphicTo(const TreePatternNode *N, + bool isIsomorphicTo(const TreePatternNode &N, const MultipleUseVarSet &DepVars) const; /// SubstituteFormalArguments - Replace the formal arguments in this tree @@ -974,7 +974,7 @@ public: private: TreePatternNodePtr ParseTreePattern(Init *DI, StringRef OpName); void ComputeNamedNodes(); - void ComputeNamedNodes(TreePatternNode *N); + void ComputeNamedNodes(TreePatternNode &N); }; inline bool TreePatternNode::UpdateNodeType(unsigned ResNo, @@ -1071,9 +1071,9 @@ public: Record *getSrcRecord() const { return SrcRecord; } ListInit *getPredicates() const { return Predicates; } - TreePatternNode *getSrcPattern() const { return SrcPattern.get(); } + TreePatternNode &getSrcPattern() const { return *SrcPattern; } TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; } - TreePatternNode *getDstPattern() const { return DstPattern.get(); } + TreePatternNode &getDstPattern() const { return *DstPattern; } TreePatternNodePtr getDstPatternShared() const { return DstPattern; } const std::vector &getDstRegs() const { return Dstregs; } StringRef getHwModeFeatures() const { return HwModeFeatures; } @@ -1250,7 +1250,7 @@ private: std::vector &InstImpResults); }; -inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode *N, +inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode &N, TreePattern &TP) const { bool MadeChange = false; for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i) diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index 32b2746..336cee0 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -38,41 +38,41 @@ public: // DAGISelEmitter Helper methods // -/// getResultPatternCost - Compute the number of instructions for this pattern. +/// Compute the number of instructions for this pattern. /// This is a temporary hack. We should really include the instruction /// latencies in this calculation. -static unsigned getResultPatternCost(TreePatternNode *P, - CodeGenDAGPatterns &CGP) { - if (P->isLeaf()) +static unsigned getResultPatternCost(TreePatternNode &P, + const CodeGenDAGPatterns &CGP) { + if (P.isLeaf()) return 0; unsigned Cost = 0; - Record *Op = P->getOperator(); + Record *Op = P.getOperator(); if (Op->isSubClassOf("Instruction")) { Cost++; CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op); if (II.usesCustomInserter) Cost += 10; } - for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) - Cost += getResultPatternCost(P->getChild(i), CGP); + for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i) + Cost += getResultPatternCost(P.getChild(i), CGP); return Cost; } /// getResultPatternCodeSize - Compute the code size of instructions for this /// pattern. 
-static unsigned getResultPatternSize(TreePatternNode *P,
-                                     CodeGenDAGPatterns &CGP) {
-  if (P->isLeaf())
+static unsigned getResultPatternSize(TreePatternNode &P,
+                                     const CodeGenDAGPatterns &CGP) {
+  if (P.isLeaf())
     return 0;

   unsigned Cost = 0;
-  Record *Op = P->getOperator();
+  Record *Op = P.getOperator();
   if (Op->isSubClassOf("Instruction")) {
     Cost += Op->getValueAsInt("CodeSize");
   }
-  for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i)
-    Cost += getResultPatternSize(P->getChild(i), CGP);
+  for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i)
+    Cost += getResultPatternSize(P.getChild(i), CGP);
   return Cost;
 }

@@ -85,11 +85,11 @@ struct PatternSortingPredicate {
   CodeGenDAGPatterns &CGP;

   bool operator()(const PatternToMatch *LHS, const PatternToMatch *RHS) {
-    const TreePatternNode *LT = LHS->getSrcPattern();
-    const TreePatternNode *RT = RHS->getSrcPattern();
+    const TreePatternNode &LT = LHS->getSrcPattern();
+    const TreePatternNode &RT = RHS->getSrcPattern();

-    MVT LHSVT = LT->getNumTypes() != 0 ? LT->getSimpleType(0) : MVT::Other;
-    MVT RHSVT = RT->getNumTypes() != 0 ? RT->getSimpleType(0) : MVT::Other;
+    MVT LHSVT = LT.getNumTypes() != 0 ? LT.getSimpleType(0) : MVT::Other;
+    MVT RHSVT = RT.getNumTypes() != 0 ? RT.getSimpleType(0) : MVT::Other;
     if (LHSVT.isVector() != RHSVT.isVector())
       return RHSVT.isVector();

@@ -156,9 +156,9 @@ void DAGISelEmitter::run(raw_ostream &OS) {
                                E = CGP.ptm_end();
        I != E; ++I) {
     errs() << "PATTERN: ";
-    I->getSrcPattern()->dump();
+    I->getSrcPattern().dump();
     errs() << "\nRESULT: ";
-    I->getDstPattern()->dump();
+    I->getDstPattern().dump();
     errs() << "\n";
   });

diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp
index 5461481..3298965 100644
--- a/llvm/utils/TableGen/DAGISelMatcher.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcher.cpp
@@ -302,8 +302,8 @@ void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, unsigned indent) const {

 void CompleteMatchMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
   OS.indent(indent) << "CompleteMatch <todo args>\n";
-  OS.indent(indent) << "Src = " << *Pattern.getSrcPattern() << "\n";
-  OS.indent(indent) << "Dst = " << *Pattern.getDstPattern() << "\n";
+  OS.indent(indent) << "Src = " << Pattern.getSrcPattern() << "\n";
+  OS.indent(indent) << "Dst = " << Pattern.getDstPattern() << "\n";
 }

 bool CheckOpcodeMatcher::isEqualImpl(const Matcher *M) const {
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index 8d002e5..b475c98 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -214,10 +214,10 @@ private:
 };
 } // end anonymous namespace.
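The hunks above and below are one mechanical migration: TreePatternNode accessors such as getChild(), getSrcPattern(), and getDstPattern() now hand out references instead of pointers, because the nodes they return are owned and never null, so every caller switches from `->` to `.` and drops its null handling. A minimal self-contained sketch of the before/after calling convention follows; the names are illustrative only, not code from this patch:

#include <memory>
#include <vector>

// Stand-in for TreePatternNode. Children are owned and never null, which is
// what makes returning a reference (rather than a pointer) safe.
struct Node {
  std::vector<std::unique_ptr<Node>> Children;
  bool isLeaf() const { return Children.empty(); }
  unsigned getNumChildren() const { return (unsigned)Children.size(); }
  // was: const Node *getChild(unsigned I) const;
  const Node &getChild(unsigned I) const { return *Children[I]; }
};

// was: unsigned countLeaves(const Node *N) { ... N->isLeaf() ... }
unsigned countLeaves(const Node &N) {
  if (N.isLeaf())
    return 1;
  unsigned Count = 0;
  for (unsigned I = 0, E = N.getNumChildren(); I != E; ++I)
    Count += countLeaves(N.getChild(I)); // '.' instead of '->', no null checks
  return Count;
}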
-static std::string GetPatFromTreePatternNode(const TreePatternNode *N) {
+static std::string GetPatFromTreePatternNode(const TreePatternNode &N) {
   std::string str;
   raw_string_ostream Stream(str);
-  Stream << *N;
+  Stream << N;
   return str;
 }

@@ -983,11 +983,11 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
       if (const MorphNodeToMatcher *SNT = dyn_cast<MorphNodeToMatcher>(N)) {
         OS.indent(FullIndexWidth + Indent)
-            << "// Src: " << *SNT->getPattern().getSrcPattern()
+            << "// Src: " << SNT->getPattern().getSrcPattern()
             << " - Complexity = " << SNT->getPattern().getPatternComplexity(CGP)
             << '\n';
         OS.indent(FullIndexWidth + Indent)
-            << "// Dst: " << *SNT->getPattern().getDstPattern() << '\n';
+            << "// Dst: " << SNT->getPattern().getDstPattern() << '\n';
       }
     } else
       OS << '\n';
@@ -1019,11 +1019,11 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
     OS << '\n';
     if (!OmitComments) {
       OS.indent(FullIndexWidth + Indent)
-          << " // Src: " << *CM->getPattern().getSrcPattern()
+          << " // Src: " << CM->getPattern().getSrcPattern()
           << " - Complexity = " << CM->getPattern().getPatternComplexity(CGP)
           << '\n';
       OS.indent(FullIndexWidth + Indent)
-          << " // Dst: " << *CM->getPattern().getDstPattern();
+          << " // Dst: " << CM->getPattern().getDstPattern();
     }
     OS << '\n';
     return 2 + NumResultBytes + NumCoveredBytes;
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 8ca7aae..956cb5e 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -113,10 +113,10 @@ private:
   void InferPossibleTypes();

   // Matcher Generation.
-  void EmitMatchCode(const TreePatternNode *N, TreePatternNode *NodeNoTypes);
-  void EmitLeafMatchCode(const TreePatternNode *N);
-  void EmitOperatorMatchCode(const TreePatternNode *N,
-                             TreePatternNode *NodeNoTypes);
+  void EmitMatchCode(const TreePatternNode &N, TreePatternNode &NodeNoTypes);
+  void EmitLeafMatchCode(const TreePatternNode &N);
+  void EmitOperatorMatchCode(const TreePatternNode &N,
+                             TreePatternNode &NodeNoTypes);

   /// If this is the first time a node with unique identifier Name has been
   /// seen, record it. Otherwise, emit a check to make sure this is the same
@@ -131,15 +131,15 @@ private:
     return VarMapEntry - 1;
   }

-  void EmitResultOperand(const TreePatternNode *N,
+  void EmitResultOperand(const TreePatternNode &N,
                          SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultOfNamedOperand(const TreePatternNode *N,
+  void EmitResultOfNamedOperand(const TreePatternNode &N,
                                 SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultLeafAsOperand(const TreePatternNode *N,
+  void EmitResultLeafAsOperand(const TreePatternNode &N,
                                SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultInstructionAsOperand(const TreePatternNode *N,
+  void EmitResultInstructionAsOperand(const TreePatternNode &N,
                                       SmallVectorImpl<unsigned> &ResultOps);
-  void EmitResultSDNodeXFormAsOperand(const TreePatternNode *N,
+  void EmitResultSDNodeXFormAsOperand(const TreePatternNode &N,
                                       SmallVectorImpl<unsigned> &ResultOps);
 };

@@ -162,7 +162,7 @@ MatcherGen::MatcherGen(const PatternToMatch &pattern,
   // apply the type to the tree, then rerun type inference. Iterate until all
   // types are resolved.
   //
-  PatWithNoTypes = Pattern.getSrcPattern()->clone();
+  PatWithNoTypes = Pattern.getSrcPattern().clone();
   PatWithNoTypes->RemoveAllTypes();

   // If there are types that are manifestly known, infer them.
@@ -198,15 +198,15 @@ void MatcherGen::AddMatcher(Matcher *NewNode) { //===----------------------------------------------------------------------===// /// EmitLeafMatchCode - Generate matching code for leaf nodes. -void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { - assert(N->isLeaf() && "Not a leaf?"); +void MatcherGen::EmitLeafMatchCode(const TreePatternNode &N) { + assert(N.isLeaf() && "Not a leaf?"); // Direct match against an integer constant. - if (IntInit *II = dyn_cast(N->getLeafValue())) { + if (IntInit *II = dyn_cast(N.getLeafValue())) { // If this is the root of the dag we're matching, we emit a redundant opcode // check to ensure that this gets folded into the normal top-level // OpcodeSwitch. - if (N == Pattern.getSrcPattern()) { + if (&N == &Pattern.getSrcPattern()) { const SDNodeInfo &NI = CGP.getSDNodeInfo(CGP.getSDNodeNamed("imm")); AddMatcher(new CheckOpcodeMatcher(NI)); } @@ -215,14 +215,14 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { } // An UnsetInit represents a named node without any constraints. - if (isa(N->getLeafValue())) { - assert(N->hasName() && "Unnamed ? leaf"); + if (isa(N.getLeafValue())) { + assert(N.hasName() && "Unnamed ? leaf"); return; } - DefInit *DI = dyn_cast(N->getLeafValue()); + DefInit *DI = dyn_cast(N.getLeafValue()); if (!DI) { - errs() << "Unknown leaf kind: " << *N << "\n"; + errs() << "Unknown leaf kind: " << N << "\n"; abort(); } @@ -232,7 +232,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { // unnamed. if (LeafRec->isSubClassOf("ValueType")) { // A named ValueType leaf always matches: (add i32:$a, i32:$b). - if (N->hasName()) + if (N.hasName()) return; // An unnamed ValueType as in (sext_inreg GPR:$foo, i8). return AddMatcher(new CheckValueTypeMatcher(LeafRec->getName())); @@ -262,17 +262,17 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { if (LeafRec->isSubClassOf("ComplexPattern")) { // We can't model ComplexPattern uses that don't have their name taken yet. // The OPC_CheckComplexPattern operation implicitly records the results. - if (N->getName().empty()) { + if (N.getName().empty()) { std::string S; raw_string_ostream OS(S); - OS << "We expect complex pattern uses to have names: " << *N; + OS << "We expect complex pattern uses to have names: " << N; PrintFatalError(S); } // Remember this ComplexPattern so that we can emit it after all the other // structural matches are done. - unsigned InputOperand = VariableMap[N->getName()] - 1; - MatchedComplexPatterns.push_back(std::make_pair(N, InputOperand)); + unsigned InputOperand = VariableMap[N.getName()] - 1; + MatchedComplexPatterns.push_back(std::make_pair(&N, InputOperand)); return; } @@ -281,8 +281,8 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { // If this is the root of the dag we're matching, we emit a redundant opcode // check to ensure that this gets folded into the normal top-level // OpcodeSwitch. - if (N == Pattern.getSrcPattern()) { - MVT VT = N->getSimpleType(0); + if (&N == &Pattern.getSrcPattern()) { + MVT VT = N.getSimpleType(0); StringRef Name = VT.isScalableVector() ? 
"splat_vector" : "build_vector"; const SDNodeInfo &NI = CGP.getSDNodeInfo(CGP.getSDNodeNamed(Name)); AddMatcher(new CheckOpcodeMatcher(NI)); @@ -294,33 +294,33 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) { return; } - errs() << "Unknown leaf kind: " << *N << "\n"; + errs() << "Unknown leaf kind: " << N << "\n"; abort(); } -void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, - TreePatternNode *NodeNoTypes) { - assert(!N->isLeaf() && "Not an operator?"); +void MatcherGen::EmitOperatorMatchCode(const TreePatternNode &N, + TreePatternNode &NodeNoTypes) { + assert(!N.isLeaf() && "Not an operator?"); - if (N->getOperator()->isSubClassOf("ComplexPattern")) { + if (N.getOperator()->isSubClassOf("ComplexPattern")) { // The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is // "MY_PAT:op1:op2". We should already have validated that the uses are // consistent. - std::string PatternName = std::string(N->getOperator()->getName()); - for (unsigned i = 0; i < N->getNumChildren(); ++i) { + std::string PatternName = std::string(N.getOperator()->getName()); + for (unsigned i = 0; i < N.getNumChildren(); ++i) { PatternName += ":"; - PatternName += N->getChild(i)->getName(); + PatternName += N.getChild(i).getName(); } if (recordUniqueNode(PatternName)) { - auto NodeAndOpNum = std::make_pair(N, NextRecordedOperandNo - 1); + auto NodeAndOpNum = std::make_pair(&N, NextRecordedOperandNo - 1); MatchedComplexPatterns.push_back(NodeAndOpNum); } return; } - const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator()); + const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N.getOperator()); // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is // a constant without a predicate fn that has more than one bit set, handle @@ -332,28 +332,28 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // them from the mask in the dag. For example, it might turn 'AND X, 255' // into 'AND X, 254' if it knows the low bit is set. Emit code that checks // to handle this. - if ((N->getOperator()->getName() == "and" || - N->getOperator()->getName() == "or") && - N->getChild(1)->isLeaf() && N->getChild(1)->getPredicateCalls().empty() && - N->getPredicateCalls().empty()) { - if (IntInit *II = dyn_cast(N->getChild(1)->getLeafValue())) { + if ((N.getOperator()->getName() == "and" || + N.getOperator()->getName() == "or") && + N.getChild(1).isLeaf() && N.getChild(1).getPredicateCalls().empty() && + N.getPredicateCalls().empty()) { + if (IntInit *II = dyn_cast(N.getChild(1).getLeafValue())) { if (!llvm::has_single_bit( II->getValue())) { // Don't bother with single bits. // If this is at the root of the pattern, we emit a redundant // CheckOpcode so that the following checks get factored properly under // a single opcode check. - if (N == Pattern.getSrcPattern()) + if (&N == &Pattern.getSrcPattern()) AddMatcher(new CheckOpcodeMatcher(CInfo)); // Emit the CheckAndImm/CheckOrImm node. - if (N->getOperator()->getName() == "and") + if (N.getOperator()->getName() == "and") AddMatcher(new CheckAndImmMatcher(II->getValue())); else AddMatcher(new CheckOrImmMatcher(II->getValue())); // Match the LHS of the AND as appropriate. AddMatcher(new MoveChildMatcher(0)); - EmitMatchCode(N->getChild(0), NodeNoTypes->getChild(0)); + EmitMatchCode(N.getChild(0), NodeNoTypes.getChild(0)); AddMatcher(new MoveParentMatcher()); return; } @@ -365,15 +365,15 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // If this node has memory references (i.e. 
is a load or store), tell the // interpreter to capture them in the memref array. - if (N->NodeHasProperty(SDNPMemOperand, CGP)) + if (N.NodeHasProperty(SDNPMemOperand, CGP)) AddMatcher(new RecordMemRefMatcher()); // If this node has a chain, then the chain is operand #0 is the SDNode, and // the child numbers of the node are all offset by one. unsigned OpNo = 0; - if (N->NodeHasProperty(SDNPHasChain, CGP)) { + if (N.NodeHasProperty(SDNPHasChain, CGP)) { // Record the node and remember it in our chained nodes list. - AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + + AddMatcher(new RecordMatcher("'" + N.getOperator()->getName().str() + "' chained node", NextRecordedOperandNo)); // Remember all of the input chains our pattern will match. @@ -404,22 +404,22 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, // To prevent this, we emit a dynamic check for legality before allowing // this to be folded. // - const TreePatternNode *Root = Pattern.getSrcPattern(); - if (N != Root) { // Not the root of the pattern. + const TreePatternNode &Root = Pattern.getSrcPattern(); + if (&N != &Root) { // Not the root of the pattern. // If there is a node between the root and this node, then we definitely // need to emit the check. - bool NeedCheck = !Root->hasChild(N); + bool NeedCheck = !Root.hasChild(&N); // If it *is* an immediate child of the root, we can still need a check if // the root SDNode has multiple inputs. For us, this means that it is an // intrinsic, has multiple operands, or has other inputs like chain or // glue). if (!NeedCheck) { - const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root->getOperator()); + const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root.getOperator()); NeedCheck = - Root->getOperator() == CGP.get_intrinsic_void_sdnode() || - Root->getOperator() == CGP.get_intrinsic_w_chain_sdnode() || - Root->getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || + Root.getOperator() == CGP.get_intrinsic_void_sdnode() || + Root.getOperator() == CGP.get_intrinsic_w_chain_sdnode() || + Root.getOperator() == CGP.get_intrinsic_wo_chain_sdnode() || PInfo.getNumOperands() > 1 || PInfo.hasProperty(SDNPHasChain) || PInfo.hasProperty(SDNPInGlue) || PInfo.hasProperty(SDNPOptInGlue); } @@ -430,26 +430,26 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N, } // If this node has an output glue and isn't the root, remember it. - if (N->NodeHasProperty(SDNPOutGlue, CGP) && N != Pattern.getSrcPattern()) { + if (N.NodeHasProperty(SDNPOutGlue, CGP) && &N != &Pattern.getSrcPattern()) { // TODO: This redundantly records nodes with both glues and chains. // Record the node and remember it in our chained nodes list. - AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() + + AddMatcher(new RecordMatcher("'" + N.getOperator()->getName().str() + "' glue output node", NextRecordedOperandNo)); } // If this node is known to have an input glue or if it *might* have an input // glue, capture it as the glue input of the pattern. - if (N->NodeHasProperty(SDNPOptInGlue, CGP) || - N->NodeHasProperty(SDNPInGlue, CGP)) + if (N.NodeHasProperty(SDNPOptInGlue, CGP) || + N.NodeHasProperty(SDNPInGlue, CGP)) AddMatcher(new CaptureGlueInputMatcher()); - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i, ++OpNo) { + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i, ++OpNo) { // Get the code suitable for matching this child. Move to the child, check // it then move back to the parent. 
AddMatcher(new MoveChildMatcher(OpNo)); - EmitMatchCode(N->getChild(i), NodeNoTypes->getChild(i)); + EmitMatchCode(N.getChild(i), NodeNoTypes.getChild(i)); AddMatcher(new MoveParentMatcher()); } } @@ -489,17 +489,17 @@ bool MatcherGen::recordUniqueNode(ArrayRef Names) { return NewRecord; } -void MatcherGen::EmitMatchCode(const TreePatternNode *N, - TreePatternNode *NodeNoTypes) { +void MatcherGen::EmitMatchCode(const TreePatternNode &N, + TreePatternNode &NodeNoTypes) { // If N and NodeNoTypes don't agree on a type, then this is a case where we // need to do a type check. Emit the check, apply the type to NodeNoTypes and // reinfer any correlated types. SmallVector ResultsToTypeCheck; - for (unsigned i = 0, e = NodeNoTypes->getNumTypes(); i != e; ++i) { - if (NodeNoTypes->getExtType(i) == N->getExtType(i)) + for (unsigned i = 0, e = NodeNoTypes.getNumTypes(); i != e; ++i) { + if (NodeNoTypes.getExtType(i) == N.getExtType(i)) continue; - NodeNoTypes->setType(i, N->getExtType(i)); + NodeNoTypes.setType(i, N.getExtType(i)); InferPossibleTypes(); ResultsToTypeCheck.push_back(i); } @@ -507,10 +507,10 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, // If this node has a name associated with it, capture it in VariableMap. If // we already saw this in the pattern, emit code to verify dagness. SmallVector Names; - if (!N->getName().empty()) - Names.push_back(N->getName()); + if (!N.getName().empty()) + Names.push_back(N.getName()); - for (const ScopedName &Name : N->getNamesAsPredicateArg()) { + for (const ScopedName &Name : N.getNamesAsPredicateArg()) { Names.push_back( ("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str()); } @@ -520,14 +520,14 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, return; } - if (N->isLeaf()) + if (N.isLeaf()) EmitLeafMatchCode(N); else EmitOperatorMatchCode(N, NodeNoTypes); // If there are node predicates for this node, generate their checks. - for (unsigned i = 0, e = N->getPredicateCalls().size(); i != e; ++i) { - const TreePredicateCall &Pred = N->getPredicateCalls()[i]; + for (unsigned i = 0, e = N.getPredicateCalls().size(); i != e; ++i) { + const TreePredicateCall &Pred = N.getPredicateCalls()[i]; SmallVector Operands; if (Pred.Fn.usesOperands()) { TreePattern *TP = Pred.Fn.getOrigPatFragRecord(); @@ -541,7 +541,7 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, } for (unsigned i = 0, e = ResultsToTypeCheck.size(); i != e; ++i) - AddMatcher(new CheckTypeMatcher(N->getSimpleType(ResultsToTypeCheck[i]), + AddMatcher(new CheckTypeMatcher(N.getSimpleType(ResultsToTypeCheck[i]), ResultsToTypeCheck[i])); } @@ -554,7 +554,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // Depending on which variant we're generating code for, emit the root opcode // check. if (const ComplexPattern *CP = - Pattern.getSrcPattern()->getComplexPatternInfo(CGP)) { + Pattern.getSrcPattern().getComplexPatternInfo(CGP)) { const std::vector &OpNodes = CP->getRootNodes(); assert(!OpNodes.empty() && "Complex Pattern must specify what it can match"); @@ -568,7 +568,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { } // Emit the matcher for the pattern structure and types. - EmitMatchCode(Pattern.getSrcPattern(), PatWithNoTypes.get()); + EmitMatchCode(Pattern.getSrcPattern(), *PatWithNoTypes); // If the pattern has a predicate on it (e.g. only enabled when a subtarget // feature is around, do the check). 
@@ -581,28 +581,28 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { // because they are generally more expensive to evaluate and more difficult to // factor. for (unsigned i = 0, e = MatchedComplexPatterns.size(); i != e; ++i) { - auto N = MatchedComplexPatterns[i].first; + auto &N = *MatchedComplexPatterns[i].first; // Remember where the results of this match get stuck. - if (N->isLeaf()) { - NamedComplexPatternOperands[N->getName()] = NextRecordedOperandNo + 1; + if (N.isLeaf()) { + NamedComplexPatternOperands[N.getName()] = NextRecordedOperandNo + 1; } else { unsigned CurOp = NextRecordedOperandNo; - for (unsigned i = 0; i < N->getNumChildren(); ++i) { - NamedComplexPatternOperands[N->getChild(i)->getName()] = CurOp + 1; - CurOp += N->getChild(i)->getNumMIResults(CGP); + for (unsigned i = 0; i < N.getNumChildren(); ++i) { + NamedComplexPatternOperands[N.getChild(i).getName()] = CurOp + 1; + CurOp += N.getChild(i).getNumMIResults(CGP); } } // Get the slot we recorded the value in from the name on the node. unsigned RecNodeEntry = MatchedComplexPatterns[i].second; - const ComplexPattern *CP = N->getComplexPatternInfo(CGP); + const ComplexPattern *CP = N.getComplexPatternInfo(CGP); assert(CP && "Not a valid ComplexPattern!"); // Emit a CheckComplexPat operation, which does the match (aborting if it // fails) and pushes the matched operands onto the recorded nodes list. - AddMatcher(new CheckComplexPatMatcher(*CP, RecNodeEntry, N->getName(), + AddMatcher(new CheckComplexPatMatcher(*CP, RecNodeEntry, N.getName(), NextRecordedOperandNo)); // Record the right number of operands. @@ -631,25 +631,25 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) { //===----------------------------------------------------------------------===// void MatcherGen::EmitResultOfNamedOperand( - const TreePatternNode *N, SmallVectorImpl &ResultOps) { - assert(!N->getName().empty() && "Operand not named!"); + const TreePatternNode &N, SmallVectorImpl &ResultOps) { + assert(!N.getName().empty() && "Operand not named!"); - if (unsigned SlotNo = NamedComplexPatternOperands[N->getName()]) { + if (unsigned SlotNo = NamedComplexPatternOperands[N.getName()]) { // Complex operands have already been completely selected, just find the // right slot ant add the arguments directly. - for (unsigned i = 0; i < N->getNumMIResults(CGP); ++i) + for (unsigned i = 0; i < N.getNumMIResults(CGP); ++i) ResultOps.push_back(SlotNo - 1 + i); return; } - unsigned SlotNo = getNamedArgumentSlot(N->getName()); + unsigned SlotNo = getNamedArgumentSlot(N.getName()); // If this is an 'imm' or 'fpimm' node, make sure to convert it to the target // version of the immediate so that it doesn't get selected due to some other // node use. 
- if (!N->isLeaf()) { - StringRef OperatorName = N->getOperator()->getName(); + if (!N.isLeaf()) { + StringRef OperatorName = N.getOperator()->getName(); if (OperatorName == "imm" || OperatorName == "fpimm") { AddMatcher(new EmitConvertToTargetMatcher(SlotNo)); ResultOps.push_back(NextRecordedOperandNo++); @@ -657,38 +657,38 @@ void MatcherGen::EmitResultOfNamedOperand( } } - for (unsigned i = 0; i < N->getNumMIResults(CGP); ++i) + for (unsigned i = 0; i < N.getNumMIResults(CGP); ++i) ResultOps.push_back(SlotNo + i); } -void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, +void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N, SmallVectorImpl &ResultOps) { - assert(N->isLeaf() && "Must be a leaf"); + assert(N.isLeaf() && "Must be a leaf"); - if (IntInit *II = dyn_cast(N->getLeafValue())) { - AddMatcher(new EmitIntegerMatcher(II->getValue(), N->getSimpleType(0))); + if (IntInit *II = dyn_cast(N.getLeafValue())) { + AddMatcher(new EmitIntegerMatcher(II->getValue(), N.getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; } // If this is an explicit register reference, handle it. - if (DefInit *DI = dyn_cast(N->getLeafValue())) { + if (DefInit *DI = dyn_cast(N.getLeafValue())) { Record *Def = DI->getDef(); if (Def->isSubClassOf("Register")) { const CodeGenRegister *Reg = CGP.getTargetInfo().getRegBank().getReg(Def); - AddMatcher(new EmitRegisterMatcher(Reg, N->getSimpleType(0))); + AddMatcher(new EmitRegisterMatcher(Reg, N.getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; } if (Def->getName() == "zero_reg") { - AddMatcher(new EmitRegisterMatcher(nullptr, N->getSimpleType(0))); + AddMatcher(new EmitRegisterMatcher(nullptr, N.getSimpleType(0))); ResultOps.push_back(NextRecordedOperandNo++); return; } if (Def->getName() == "undef_tied_input") { - MVT::SimpleValueType ResultVT = N->getSimpleType(0); + MVT::SimpleValueType ResultVT = N.getSimpleType(0); auto IDOperandNo = NextRecordedOperandNo++; Record *ImpDef = Def->getRecords().getDef("IMPLICIT_DEF"); CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(ImpDef); @@ -741,23 +741,23 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N, } errs() << "unhandled leaf node:\n"; - N->dump(); + N.dump(); } -static bool mayInstNodeLoadOrStore(const TreePatternNode *N, +static bool mayInstNodeLoadOrStore(const TreePatternNode &N, const CodeGenDAGPatterns &CGP) { - Record *Op = N->getOperator(); + Record *Op = N.getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); return II.mayLoad || II.mayStore; } -static unsigned numNodesThatMayLoadOrStore(const TreePatternNode *N, +static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N, const CodeGenDAGPatterns &CGP) { - if (N->isLeaf()) + if (N.isLeaf()) return 0; - Record *OpRec = N->getOperator(); + Record *OpRec = N.getOperator(); if (!OpRec->isSubClassOf("Instruction")) return 0; @@ -765,31 +765,31 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode *N, if (mayInstNodeLoadOrStore(N, CGP)) ++Count; - for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) - Count += numNodesThatMayLoadOrStore(N->getChild(i), CGP); + for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i) + Count += numNodesThatMayLoadOrStore(N.getChild(i), CGP); return Count; } void MatcherGen::EmitResultInstructionAsOperand( - const TreePatternNode *N, SmallVectorImpl &OutputOps) { - Record *Op = N->getOperator(); + const TreePatternNode &N, SmallVectorImpl 
&OutputOps) { + Record *Op = N.getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); const DAGInstruction &Inst = CGP.getInstruction(Op); - bool isRoot = N == Pattern.getDstPattern(); + bool isRoot = &N == &Pattern.getDstPattern(); // TreeHasOutGlue - True if this tree has glue. bool TreeHasInGlue = false, TreeHasOutGlue = false; if (isRoot) { - const TreePatternNode *SrcPat = Pattern.getSrcPattern(); - TreeHasInGlue = SrcPat->TreeHasProperty(SDNPOptInGlue, CGP) || - SrcPat->TreeHasProperty(SDNPInGlue, CGP); + const TreePatternNode &SrcPat = Pattern.getSrcPattern(); + TreeHasInGlue = SrcPat.TreeHasProperty(SDNPOptInGlue, CGP) || + SrcPat.TreeHasProperty(SDNPInGlue, CGP); // FIXME2: this is checking the entire pattern, not just the node in // question, doing this just for the root seems like a total hack. - TreeHasOutGlue = SrcPat->TreeHasProperty(SDNPOutGlue, CGP); + TreeHasOutGlue = SrcPat.TreeHasProperty(SDNPOutGlue, CGP); } // NumResults - This is the number of results produced by the instruction in @@ -826,13 +826,13 @@ void MatcherGen::EmitResultInstructionAsOperand( // Determine what to emit for this operand. Record *OperandNode = II.Operands[InstOpNo].Rec; if (CGP.operandHasDefault(OperandNode) && - (InstOpNo < NonOverridableOperands || ChildNo >= N->getNumChildren())) { + (InstOpNo < NonOverridableOperands || ChildNo >= N.getNumChildren())) { // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands. const DAGDefaultOperand &DefaultOp = CGP.getDefaultOperand(OperandNode); for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i) - EmitResultOperand(DefaultOp.DefaultOps[i].get(), InstOps); + EmitResultOperand(*DefaultOp.DefaultOps[i], InstOps); continue; } @@ -851,14 +851,14 @@ void MatcherGen::EmitResultInstructionAsOperand( unsigned FinalNumOps = InstOps.size() + NumSubOps; while (InstOps.size() < FinalNumOps) { - const TreePatternNode *Child = N->getChild(ChildNo); + const TreePatternNode &Child = N.getChild(ChildNo); unsigned BeforeAddingNumOps = InstOps.size(); EmitResultOperand(Child, InstOps); assert(InstOps.size() > BeforeAddingNumOps && "Didn't add any operands"); // If the operand is an instruction and it produced multiple results, just // take the first one. - if (!Child->isLeaf() && Child->getOperator()->isSubClassOf("Instruction")) + if (!Child.isLeaf() && Child.getOperator()->isSubClassOf("Instruction")) InstOps.resize(BeforeAddingNumOps + 1); ++ChildNo; @@ -871,8 +871,8 @@ void MatcherGen::EmitResultInstructionAsOperand( // above. Emit the remaining instructions implicitly added by the use for // variable_ops. if (II.Operands.isVariadic) { - for (unsigned I = ChildNo, E = N->getNumChildren(); I < E; ++I) - EmitResultOperand(N->getChild(I), InstOps); + for (unsigned I = ChildNo, E = N.getNumChildren(); I < E; ++I) + EmitResultOperand(N.getChild(I), InstOps); } // If this node has input glue or explicitly specified input physregs, we @@ -896,8 +896,8 @@ void MatcherGen::EmitResultInstructionAsOperand( // Determine the result types. SmallVector ResultVTs; - for (unsigned i = 0, e = N->getNumTypes(); i != e; ++i) - ResultVTs.push_back(N->getSimpleType(i)); + for (unsigned i = 0, e = N.getNumTypes(); i != e; ++i) + ResultVTs.push_back(N.getSimpleType(i)); // If this is the root instruction of a pattern that has physical registers in // its result pattern, add output VTs for them. 
For example, X86 has: @@ -922,8 +922,8 @@ void MatcherGen::EmitResultInstructionAsOperand( // a node that is variadic, mark the generated node as variadic so that it // gets the excess operands from the input DAG. int NumFixedArityOperands = -1; - if (isRoot && Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)) - NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren(); + if (isRoot && Pattern.getSrcPattern().NodeHasProperty(SDNPVariadic, CGP)) + NumFixedArityOperands = Pattern.getSrcPattern().getNumChildren(); // If this is the root node and multiple matched nodes in the input pattern // have MemRefs in them, have the interpreter collect them and plop them onto @@ -933,7 +933,7 @@ void MatcherGen::EmitResultInstructionAsOperand( // FIXME3: This is actively incorrect for result patterns with multiple // memory-referencing instructions. bool PatternHasMemOperands = - Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP); + Pattern.getSrcPattern().TreeHasProperty(SDNPMemOperand, CGP); bool NodeHasMemRefs = false; if (PatternHasMemOperands) { @@ -948,7 +948,7 @@ void MatcherGen::EmitResultInstructionAsOperand( // Determine whether we need to attach a chain to this node. bool NodeHasChain = false; - if (Pattern.getSrcPattern()->TreeHasProperty(SDNPHasChain, CGP)) { + if (Pattern.getSrcPattern().TreeHasProperty(SDNPHasChain, CGP)) { // For some instructions, we were able to infer from the pattern whether // they should have a chain. Otherwise, attach the chain to the root. // @@ -982,8 +982,8 @@ void MatcherGen::EmitResultInstructionAsOperand( } void MatcherGen::EmitResultSDNodeXFormAsOperand( - const TreePatternNode *N, SmallVectorImpl &ResultOps) { - assert(N->getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?"); + const TreePatternNode &N, SmallVectorImpl &ResultOps) { + assert(N.getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?"); // Emit the operand. SmallVector InputOps; @@ -991,31 +991,31 @@ void MatcherGen::EmitResultSDNodeXFormAsOperand( // FIXME2: Could easily generalize this to support multiple inputs and outputs // to the SDNodeXForm. For now we just support one input and one output like // the old instruction selector. - assert(N->getNumChildren() == 1); - EmitResultOperand(N->getChild(0), InputOps); + assert(N.getNumChildren() == 1); + EmitResultOperand(N.getChild(0), InputOps); // The input currently must have produced exactly one result. assert(InputOps.size() == 1 && "Unexpected input to SDNodeXForm"); - AddMatcher(new EmitNodeXFormMatcher(InputOps[0], N->getOperator())); + AddMatcher(new EmitNodeXFormMatcher(InputOps[0], N.getOperator())); ResultOps.push_back(NextRecordedOperandNo++); } -void MatcherGen::EmitResultOperand(const TreePatternNode *N, +void MatcherGen::EmitResultOperand(const TreePatternNode &N, SmallVectorImpl &ResultOps) { // This is something selected from the pattern we matched. 
- if (!N->getName().empty()) + if (!N.getName().empty()) return EmitResultOfNamedOperand(N, ResultOps); - if (N->isLeaf()) + if (N.isLeaf()) return EmitResultLeafAsOperand(N, ResultOps); - Record *OpRec = N->getOperator(); + Record *OpRec = N.getOperator(); if (OpRec->isSubClassOf("Instruction")) return EmitResultInstructionAsOperand(N, ResultOps); if (OpRec->isSubClassOf("SDNodeXForm")) return EmitResultSDNodeXFormAsOperand(N, ResultOps); - errs() << "Unknown result node to emit code for: " << *N << '\n'; + errs() << "Unknown result node to emit code for: " << N << '\n'; PrintFatalError("Unknown node in result pattern!"); } @@ -1036,18 +1036,17 @@ void MatcherGen::EmitResultCode() { // just lop them off. This doesn't need to worry about glue or chains, just // explicit results. // - unsigned NumSrcResults = Pattern.getSrcPattern()->getNumTypes(); + unsigned NumSrcResults = Pattern.getSrcPattern().getNumTypes(); // If the pattern also has (implicit) results, count them as well. if (!Pattern.getDstRegs().empty()) { // If the root came from an implicit def in the instruction handling stuff, // don't re-add it. Record *HandledReg = nullptr; - const TreePatternNode *DstPat = Pattern.getDstPattern(); - if (!DstPat->isLeaf() && - DstPat->getOperator()->isSubClassOf("Instruction")) { + const TreePatternNode &DstPat = Pattern.getDstPattern(); + if (!DstPat.isLeaf() && DstPat.getOperator()->isSubClassOf("Instruction")) { const CodeGenTarget &CGT = CGP.getTargetInfo(); - CodeGenInstruction &II = CGT.getInstruction(DstPat->getOperator()); + CodeGenInstruction &II = CGT.getInstruction(DstPat.getOperator()); if (II.HasOneImplicitDefWithKnownVT(CGT) != MVT::Other) HandledReg = II.ImplicitDefs[0]; @@ -1063,9 +1062,9 @@ void MatcherGen::EmitResultCode() { SmallVector Results(Ops); // Apply result permutation. - for (unsigned ResNo = 0; ResNo < Pattern.getDstPattern()->getNumResults(); + for (unsigned ResNo = 0; ResNo < Pattern.getDstPattern().getNumResults(); ++ResNo) { - Results[ResNo] = Ops[Pattern.getDstPattern()->getResultIndex(ResNo)]; + Results[ResNo] = Ops[Pattern.getDstPattern().getResultIndex(ResNo)]; } Results.resize(NumSrcResults); diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index b137492..f786d41 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -99,7 +99,7 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, const PatternToMatch &Pattern = CM->getPattern(); if (!EN->hasChain() && - Pattern.getSrcPattern()->NodeHasProperty(SDNPHasChain, CGP)) + Pattern.getSrcPattern().NodeHasProperty(SDNPHasChain, CGP)) ResultsMatch = false; // If the matched node has glue and the output root doesn't, we can't @@ -109,7 +109,7 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, // because the code in the pattern generator doesn't handle it right. We // do it anyway for thoroughness. if (!EN->hasOutGlue() && - Pattern.getSrcPattern()->NodeHasProperty(SDNPOutGlue, CGP)) + Pattern.getSrcPattern().NodeHasProperty(SDNPOutGlue, CGP)) ResultsMatch = false; #if 0 diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index dff6503..00a1650 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -200,36 +200,36 @@ struct OperandsSignature { /// of the Operands array accordingly. Return true if all the operands /// are supported, false otherwise. 
/// - bool initialize(TreePatternNode *InstPatNode, const CodeGenTarget &Target, + bool initialize(TreePatternNode &InstPatNode, const CodeGenTarget &Target, MVT::SimpleValueType VT, ImmPredicateSet &ImmediatePredicates, const CodeGenRegisterClass *OrigDstRC) { - if (InstPatNode->isLeaf()) + if (InstPatNode.isLeaf()) return false; - if (InstPatNode->getOperator()->getName() == "imm") { + if (InstPatNode.getOperator()->getName() == "imm") { Operands.push_back(OpKind::getImm(0)); return true; } - if (InstPatNode->getOperator()->getName() == "fpimm") { + if (InstPatNode.getOperator()->getName() == "fpimm") { Operands.push_back(OpKind::getFP()); return true; } const CodeGenRegisterClass *DstRC = nullptr; - for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) { - TreePatternNode *Op = InstPatNode->getChild(i); + for (unsigned i = 0, e = InstPatNode.getNumChildren(); i != e; ++i) { + TreePatternNode &Op = InstPatNode.getChild(i); // Handle imm operands specially. - if (!Op->isLeaf() && Op->getOperator()->getName() == "imm") { + if (!Op.isLeaf() && Op.getOperator()->getName() == "imm") { unsigned PredNo = 0; - if (!Op->getPredicateCalls().empty()) { - TreePredicateFn PredFn = Op->getPredicateCalls()[0].Fn; + if (!Op.getPredicateCalls().empty()) { + TreePredicateFn PredFn = Op.getPredicateCalls()[0].Fn; // If there is more than one predicate weighing in on this operand // then we don't handle it. This doesn't typically happen for // immediates anyway. - if (Op->getPredicateCalls().size() > 1 || + if (Op.getPredicateCalls().size() > 1 || !PredFn.isImmediatePattern() || PredFn.usesOperands()) return false; // Ignore any instruction with 'FastIselShouldIgnore', these are @@ -249,11 +249,11 @@ struct OperandsSignature { // For now, filter out any operand with a predicate. // For now, filter out any operand with multiple values. - if (!Op->getPredicateCalls().empty() || Op->getNumTypes() != 1) + if (!Op.getPredicateCalls().empty() || Op.getNumTypes() != 1) return false; - if (!Op->isLeaf()) { - if (Op->getOperator()->getName() == "fpimm") { + if (!Op.isLeaf()) { + if (Op.getOperator()->getName() == "fpimm") { Operands.push_back(OpKind::getFP()); continue; } @@ -261,15 +261,15 @@ struct OperandsSignature { return false; } - assert(Op->hasConcreteType(0) && "Type infererence not done?"); + assert(Op.hasConcreteType(0) && "Type infererence not done?"); // For now, all the operands must have the same type (if they aren't // immediates). Note that this causes us to reject variable sized shifts // on X86. - if (Op->getSimpleType(0) != VT) + if (Op.getSimpleType(0) != VT) return false; - DefInit *OpDI = dyn_cast(Op->getLeafValue()); + DefInit *OpDI = dyn_cast(Op.getLeafValue()); if (!OpDI) return false; Record *OpLeafRec = OpDI->getDef(); @@ -430,14 +430,14 @@ static std::string getLegalCName(std::string OpName) { FastISelMap::FastISelMap(StringRef instns) : InstNS(instns) {} -static std::string PhyRegForNode(TreePatternNode *Op, +static std::string PhyRegForNode(TreePatternNode &Op, const CodeGenTarget &Target) { std::string PhysReg; - if (!Op->isLeaf()) + if (!Op.isLeaf()) return PhysReg; - Record *OpLeafRec = cast(Op->getLeafValue())->getDef(); + Record *OpLeafRec = cast(Op.getLeafValue())->getDef(); if (!OpLeafRec->isSubClassOf("Register")) return PhysReg; @@ -458,10 +458,10 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // For now, just look at Instructions, so that we don't have to worry // about emitting multiple instructions for a pattern. 
- TreePatternNode *Dst = Pattern.getDstPattern(); - if (Dst->isLeaf()) + TreePatternNode &Dst = Pattern.getDstPattern(); + if (Dst.isLeaf()) continue; - Record *Op = Dst->getOperator(); + Record *Op = Dst.getOperator(); if (!Op->isSubClassOf("Instruction")) continue; CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op); @@ -477,11 +477,11 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { // For now, ignore multi-instruction patterns. bool MultiInsts = false; - for (unsigned i = 0, e = Dst->getNumChildren(); i != e; ++i) { - TreePatternNode *ChildOp = Dst->getChild(i); - if (ChildOp->isLeaf()) + for (unsigned i = 0, e = Dst.getNumChildren(); i != e; ++i) { + TreePatternNode &ChildOp = Dst.getChild(i); + if (ChildOp.isLeaf()) continue; - if (ChildOp->getOperator()->isSubClassOf("Instruction")) { + if (ChildOp.getOperator()->isSubClassOf("Instruction")) { MultiInsts = true; break; } @@ -505,40 +505,38 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { } else { // If this isn't a leaf, then continue since the register classes are // a bit too complicated for now. - if (!Dst->getChild(1)->isLeaf()) + if (!Dst.getChild(1).isLeaf()) continue; - DefInit *SR = dyn_cast(Dst->getChild(1)->getLeafValue()); + DefInit *SR = dyn_cast(Dst.getChild(1).getLeafValue()); if (SR) SubRegNo = getQualifiedName(SR->getDef()); else - SubRegNo = Dst->getChild(1)->getLeafValue()->getAsString(); + SubRegNo = Dst.getChild(1).getLeafValue()->getAsString(); } // Inspect the pattern. - TreePatternNode *InstPatNode = Pattern.getSrcPattern(); - if (!InstPatNode) - continue; - if (InstPatNode->isLeaf()) + TreePatternNode &InstPatNode = Pattern.getSrcPattern(); + if (InstPatNode.isLeaf()) continue; // Ignore multiple result nodes for now. - if (InstPatNode->getNumTypes() > 1) + if (InstPatNode.getNumTypes() > 1) continue; - Record *InstPatOp = InstPatNode->getOperator(); + Record *InstPatOp = InstPatNode.getOperator(); std::string OpcodeName = getOpcodeName(InstPatOp, CGP); MVT::SimpleValueType RetVT = MVT::isVoid; - if (InstPatNode->getNumTypes()) - RetVT = InstPatNode->getSimpleType(0); + if (InstPatNode.getNumTypes()) + RetVT = InstPatNode.getSimpleType(0); MVT::SimpleValueType VT = RetVT; - if (InstPatNode->getNumChildren()) { - assert(InstPatNode->getChild(0)->getNumTypes() == 1); - VT = InstPatNode->getChild(0)->getSimpleType(0); + if (InstPatNode.getNumChildren()) { + assert(InstPatNode.getChild(0).getNumTypes() == 1); + VT = InstPatNode.getChild(0).getSimpleType(0); } // For now, filter out any instructions with predicates. - if (!InstPatNode->getPredicateCalls().empty()) + if (!InstPatNode.getPredicateCalls().empty()) continue; // Check all the operands. @@ -548,20 +546,20 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { continue; std::vector PhysRegInputs; - if (InstPatNode->getOperator()->getName() == "imm" || - InstPatNode->getOperator()->getName() == "fpimm") + if (InstPatNode.getOperator()->getName() == "imm" || + InstPatNode.getOperator()->getName() == "fpimm") PhysRegInputs.push_back(""); else { // Compute the PhysRegs used by the given pattern, and check that // the mapping from the src to dst patterns is simple. 
bool FoundNonSimplePattern = false; unsigned DstIndex = 0; - for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) { - std::string PhysReg = PhyRegForNode(InstPatNode->getChild(i), Target); + for (unsigned i = 0, e = InstPatNode.getNumChildren(); i != e; ++i) { + std::string PhysReg = PhyRegForNode(InstPatNode.getChild(i), Target); if (PhysReg.empty()) { - if (DstIndex >= Dst->getNumChildren() || - Dst->getChild(DstIndex)->getName() != - InstPatNode->getChild(i)->getName()) { + if (DstIndex >= Dst.getNumChildren() || + Dst.getChild(DstIndex).getName() != + InstPatNode.getChild(i).getName()) { FoundNonSimplePattern = true; break; } @@ -571,7 +569,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { PhysRegInputs.push_back(PhysReg); } - if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst->getNumChildren()) + if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst.getNumChildren()) FoundNonSimplePattern = true; if (FoundNonSimplePattern) @@ -591,7 +589,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { std::string PredicateCheck = Pattern.getPredicateCheck(); // Ok, we found a pattern that we can handle. Remember it. - InstructionMemo Memo(Pattern.getDstPattern()->getOperator()->getName(), + InstructionMemo Memo(Pattern.getDstPattern().getOperator()->getName(), DstRC, SubRegNo, PhysRegInputs, PredicateCheck); int complexity = Pattern.getPatternComplexity(CGP); diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 22e7785..13f2384 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -90,10 +90,10 @@ static cl::opt OptimizeMatchTable( namespace { -static std::string explainPredicates(const TreePatternNode *N) { +static std::string explainPredicates(const TreePatternNode &N) { std::string Explanation; StringRef Separator = ""; - for (const TreePredicateCall &Call : N->getPredicateCalls()) { + for (const TreePredicateCall &Call : N.getPredicateCalls()) { const TreePredicateFn &P = Call.Fn; Explanation += (Separator + P.getOrigPatFragRecord()->getRecord()->getName()).str(); @@ -194,12 +194,12 @@ static Error failedImport(const Twine &Reason) { return make_error(Reason, inconvertibleErrorCode()); } -static Error isTrivialOperatorNode(const TreePatternNode *N) { +static Error isTrivialOperatorNode(const TreePatternNode &N) { std::string Explanation; std::string Separator; bool HasUnsupportedPredicate = false; - for (const TreePredicateCall &Call : N->getPredicateCalls()) { + for (const TreePredicateCall &Call : N.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; if (Predicate.isAlwaysTrue()) @@ -288,8 +288,8 @@ static std::string getMangledRootDefName(StringRef DefOperandName) { //===- GlobalISelEmitter class --------------------------------------------===// -static Expected getInstResultType(const TreePatternNode *Dst) { - ArrayRef ChildTypes = Dst->getExtTypes(); +static Expected getInstResultType(const TreePatternNode &Dst) { + ArrayRef ChildTypes = Dst.getExtTypes(); if (ChildTypes.size() != 1) return failedImport("Dst pattern child has multiple results"); @@ -372,40 +372,40 @@ private: Record *findNodeEquiv(Record *N) const; const CodeGenInstruction *getEquivNode(Record &Equiv, - const TreePatternNode *N) const; + const TreePatternNode &N) const; Error importRulePredicates(RuleMatcher &M, ArrayRef Predicates); Expected createAndImportSelDAGMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *Src, 
unsigned &TempOpIdx); + const TreePatternNode &Src, unsigned &TempOpIdx); Error importComplexPatternOperandMatcher(OperandMatcher &OM, Record *R, unsigned &TempOpIdx) const; Error importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *SrcChild, + const TreePatternNode &SrcChild, bool OperandIsAPointer, bool OperandIsImmArg, unsigned OpIdx, unsigned &TempOpIdx); Expected createAndImportInstructionRenderer( RuleMatcher &M, InstructionMatcher &InsnMatcher, - const TreePatternNode *Src, const TreePatternNode *Dst); + const TreePatternNode &Src, const TreePatternNode &Dst); Expected createAndImportSubInstructionRenderer( - action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - const TreePatternNode *Src, unsigned TempReg); + action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst, + const TreePatternNode &Src, unsigned TempReg); Expected createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M, - const TreePatternNode *Dst); + const TreePatternNode &Dst); Expected importExplicitDefRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const TreePatternNode *Src, const TreePatternNode *Dst); + const TreePatternNode &Src, const TreePatternNode &Dst); Expected importExplicitUseRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst, const TreePatternNode *Src); + const llvm::TreePatternNode &Dst, const TreePatternNode &Src); Expected importExplicitUseRenderer( action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, - const TreePatternNode *DstChild, const TreePatternNode *Src); + const TreePatternNode &DstChild, const TreePatternNode &Src); Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, const DAGDefaultOperand &DefaultOp) const; @@ -430,25 +430,25 @@ private: /// If no register class is found, return std::nullopt. std::optional inferSuperRegisterClassForNode(const TypeSetByHwMode &Ty, - const TreePatternNode *SuperRegNode, - const TreePatternNode *SubRegIdxNode); + const TreePatternNode &SuperRegNode, + const TreePatternNode &SubRegIdxNode); std::optional - inferSubRegIndexForNode(const TreePatternNode *SubRegIdxNode); + inferSubRegIndexForNode(const TreePatternNode &SubRegIdxNode); /// Infer a CodeGenRegisterClass which suppoorts \p Ty and \p SubRegIdxNode. /// Return std::nullopt if no such class exists. std::optional inferSuperRegisterClass(const TypeSetByHwMode &Ty, - const TreePatternNode *SubRegIdxNode); + const TreePatternNode &SubRegIdxNode); /// Return the CodeGenRegisterClass associated with \p Leaf if it has one. std::optional - getRegClassFromLeaf(const TreePatternNode *Leaf); + getRegClassFromLeaf(const TreePatternNode &Leaf); /// Return a CodeGenRegisterClass for \p N if one can be found. Return /// std::nullopt otherwise. std::optional - inferRegClassFromPattern(const TreePatternNode *N); + inferRegClassFromPattern(const TreePatternNode &N); /// Return the size of the MemoryVT in this predicate, if possible. std::optional @@ -498,19 +498,19 @@ Record *GlobalISelEmitter::findNodeEquiv(Record *N) const { } const CodeGenInstruction * -GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode *N) const { - if (N->getNumChildren() >= 1) { +GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode &N) const { + if (N.getNumChildren() >= 1) { // setcc operation maps to two different G_* instructions based on the type. 
if (!Equiv.isValueUnset("IfFloatingPoint") && - MVT(N->getChild(0)->getSimpleType(0)).isFloatingPoint()) + MVT(N.getChild(0).getSimpleType(0)).isFloatingPoint()) return &Target.getInstruction(Equiv.getValueAsDef("IfFloatingPoint")); } if (!Equiv.isValueUnset("IfConvergent") && - N->getIntrinsicInfo(CGP)->isConvergent) + N.getIntrinsicInfo(CGP)->isConvergent) return &Target.getInstruction(Equiv.getValueAsDef("IfConvergent")); - for (const TreePredicateCall &Call : N->getPredicateCalls()) { + for (const TreePredicateCall &Call : N.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; if (!Equiv.isValueUnset("IfSignExtend") && (Predicate.isLoad() || Predicate.isAtomic()) && @@ -707,15 +707,15 @@ Expected GlobalISelEmitter::addBuiltinPredicates( Expected GlobalISelEmitter::createAndImportSelDAGMatcher( RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *Src, unsigned &TempOpIdx) { - const auto SavedFlags = Rule.setGISelFlags(Src->getGISelFlagsRecord()); + const TreePatternNode &Src, unsigned &TempOpIdx) { + const auto SavedFlags = Rule.setGISelFlags(Src.getGISelFlagsRecord()); Record *SrcGIEquivOrNull = nullptr; const CodeGenInstruction *SrcGIOrNull = nullptr; // Start with the defined operands (i.e., the results of the root operator). - if (Src->isLeaf()) { - Init *SrcInit = Src->getLeafValue(); + if (Src.isLeaf()) { + Init *SrcInit = Src.getLeafValue(); if (isa(SrcInit)) { InsnMatcher.addPredicate( &Target.getInstruction(RK.getDef("G_CONSTANT"))); @@ -723,10 +723,10 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( return failedImport( "Unable to deduce gMIR opcode to handle Src (which is a leaf)"); } else { - SrcGIEquivOrNull = findNodeEquiv(Src->getOperator()); + SrcGIEquivOrNull = findNodeEquiv(Src.getOperator()); if (!SrcGIEquivOrNull) return failedImport("Pattern operator lacks an equivalent Instruction" + - explainOperator(Src->getOperator())); + explainOperator(Src.getOperator())); SrcGIOrNull = getEquivNode(*SrcGIEquivOrNull, Src); // The operators look good: match the opcode @@ -734,7 +734,7 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( } unsigned OpIdx = 0; - for (const TypeSetByHwMode &VTy : Src->getExtTypes()) { + for (const TypeSetByHwMode &VTy : Src.getExtTypes()) { // Results don't have a name unless they are the root node. The caller will // set the name if appropriate. const bool OperandIsAPointer = @@ -745,7 +745,7 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( " for result of Src pattern operator"); } - for (const TreePredicateCall &Call : Src->getPredicateCalls()) { + for (const TreePredicateCall &Call : Src.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; bool HasAddedBuiltinMatcher = true; if (Predicate.isAlwaysTrue()) @@ -800,11 +800,11 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( "Unordered", AtomicOrderingMMOPredicateMatcher::AO_OrStronger); } - if (Src->isLeaf()) { - Init *SrcInit = Src->getLeafValue(); + if (Src.isLeaf()) { + Init *SrcInit = Src.getLeafValue(); if (IntInit *SrcIntInit = dyn_cast(SrcInit)) { OperandMatcher &OM = - InsnMatcher.addOperand(OpIdx++, Src->getName(), TempOpIdx); + InsnMatcher.addOperand(OpIdx++, Src.getName(), TempOpIdx); OM.addPredicate(SrcIntInit->getValue()); } else return failedImport( @@ -825,19 +825,19 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( // predicate operand needs to be swapped from the last operand to the first // source. 
- unsigned NumChildren = Src->getNumChildren(); + unsigned NumChildren = Src.getNumChildren(); bool IsFCmp = SrcGIOrNull->TheDef->getName() == "G_FCMP"; if (IsFCmp || SrcGIOrNull->TheDef->getName() == "G_ICMP") { - const TreePatternNode *SrcChild = Src->getChild(NumChildren - 1); - if (SrcChild->isLeaf()) { - DefInit *DI = dyn_cast(SrcChild->getLeafValue()); + const TreePatternNode &SrcChild = Src.getChild(NumChildren - 1); + if (SrcChild.isLeaf()) { + DefInit *DI = dyn_cast(SrcChild.getLeafValue()); Record *CCDef = DI ? DI->getDef() : nullptr; if (!CCDef || !CCDef->isSubClassOf("CondCode")) return failedImport("Unable to handle CondCode"); OperandMatcher &OM = - InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx); + InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx); StringRef PredType = IsFCmp ? CCDef->getValueAsString("FCmpPredicate") : CCDef->getValueAsString("ICmpPredicate"); @@ -856,12 +856,12 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( SrcGIOrNull->TheDef->getName() == "G_INTRINSIC_CONVERGENT" || SrcGIOrNull->TheDef->getName() == "G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS"; - const CodeGenIntrinsic *II = Src->getIntrinsicInfo(CGP); + const CodeGenIntrinsic *II = Src.getIntrinsicInfo(CGP); if (IsIntrinsic && !II) return failedImport("Expected IntInit containing intrinsic ID)"); for (unsigned i = 0; i != NumChildren; ++i) { - const TreePatternNode *SrcChild = Src->getChild(i); + const TreePatternNode &SrcChild = Src.getChild(i); // We need to determine the meaning of a literal integer based on the // context. If this is a field required to be an immediate (such as an @@ -884,7 +884,7 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( // following the defs is an intrinsic ID. if (i == 0) { OperandMatcher &OM = - InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx); + InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx); OM.addPredicate(II); continue; } @@ -921,11 +921,11 @@ Error GlobalISelEmitter::importComplexPatternOperandMatcher( // Get the name to use for a pattern operand. For an anonymous physical register // input, this should use the register name. -static StringRef getSrcChildName(const TreePatternNode *SrcChild, +static StringRef getSrcChildName(const TreePatternNode &SrcChild, Record *&PhysReg) { - StringRef SrcChildName = SrcChild->getName(); - if (SrcChildName.empty() && SrcChild->isLeaf()) { - if (auto *ChildDefInit = dyn_cast(SrcChild->getLeafValue())) { + StringRef SrcChildName = SrcChild.getName(); + if (SrcChildName.empty() && SrcChild.isLeaf()) { + if (auto *ChildDefInit = dyn_cast(SrcChild.getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); if (ChildRec->isSubClassOf("Register")) { SrcChildName = ChildRec->getName(); @@ -939,19 +939,19 @@ static StringRef getSrcChildName(const TreePatternNode *SrcChild, Error GlobalISelEmitter::importChildMatcher( RuleMatcher &Rule, InstructionMatcher &InsnMatcher, - const TreePatternNode *SrcChild, bool OperandIsAPointer, + const TreePatternNode &SrcChild, bool OperandIsAPointer, bool OperandIsImmArg, unsigned OpIdx, unsigned &TempOpIdx) { Record *PhysReg = nullptr; std::string SrcChildName = std::string(getSrcChildName(SrcChild, PhysReg)); - if (!SrcChild->isLeaf() && - SrcChild->getOperator()->isSubClassOf("ComplexPattern")) { + if (!SrcChild.isLeaf() && + SrcChild.getOperator()->isSubClassOf("ComplexPattern")) { // The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is // "MY_PAT:op1:op2" and the ones with same "name" represent same operand. 
- std::string PatternName = std::string(SrcChild->getOperator()->getName()); - for (unsigned i = 0; i < SrcChild->getNumChildren(); ++i) { + std::string PatternName = std::string(SrcChild.getOperator()->getName()); + for (unsigned i = 0; i < SrcChild.getNumChildren(); ++i) { PatternName += ":"; - PatternName += SrcChild->getChild(i)->getName(); + PatternName += SrcChild.getChild(i).getName(); } SrcChildName = PatternName; } @@ -962,23 +962,23 @@ Error GlobalISelEmitter::importChildMatcher( if (OM.isSameAsAnotherOperand()) return Error::success(); - ArrayRef ChildTypes = SrcChild->getExtTypes(); + ArrayRef ChildTypes = SrcChild.getExtTypes(); if (ChildTypes.size() != 1) return failedImport("Src pattern child has multiple results"); // Check MBB's before the type check since they are not a known type. - if (!SrcChild->isLeaf()) { - if (SrcChild->getOperator()->isSubClassOf("SDNode")) { - auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild->getOperator()); + if (!SrcChild.isLeaf()) { + if (SrcChild.getOperator()->isSubClassOf("SDNode")) { + auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild.getOperator()); if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") { OM.addPredicate(); return Error::success(); } - if (SrcChild->getOperator()->getName() == "timm") { + if (SrcChild.getOperator()->getName() == "timm") { OM.addPredicate(); // Add predicates, if any - for (const TreePredicateCall &Call : SrcChild->getPredicateCalls()) { + for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; // Only handle immediate patterns for now @@ -998,12 +998,12 @@ Error GlobalISelEmitter::importChildMatcher( if (auto Error = OM.addTypeCheckPredicate(ChildTypes.front(), OperandIsAPointer)) return failedImport(toString(std::move(Error)) + " for Src operand (" + - to_string(*SrcChild) + ")"); + to_string(SrcChild) + ")"); } // Try look up SrcChild for a (named) predicate operand if there is any. if (WaitingForNamedOperands) { - auto &ScopedNames = SrcChild->getNamesAsPredicateArg(); + auto &ScopedNames = SrcChild.getNamesAsPredicateArg(); if (!ScopedNames.empty()) { auto PA = ScopedNames.begin(); std::string Name = getScopedName(PA->getScope(), PA->getIdentifier()); @@ -1013,22 +1013,22 @@ Error GlobalISelEmitter::importChildMatcher( } // Check for nested instructions. - if (!SrcChild->isLeaf()) { - if (SrcChild->getOperator()->isSubClassOf("ComplexPattern")) { + if (!SrcChild.isLeaf()) { + if (SrcChild.getOperator()->isSubClassOf("ComplexPattern")) { // When a ComplexPattern is used as an operator, it should do the same // thing as when used as a leaf. However, the children of the operator // name the sub-operands that make up the complex operand and we must // prepare to reference them in the renderer too. 
unsigned RendererID = TempOpIdx; if (auto Error = importComplexPatternOperandMatcher( - OM, SrcChild->getOperator(), TempOpIdx)) + OM, SrcChild.getOperator(), TempOpIdx)) return Error; - for (unsigned i = 0, e = SrcChild->getNumChildren(); i != e; ++i) { - auto *SubOperand = SrcChild->getChild(i); - if (!SubOperand->getName().empty()) { + for (unsigned i = 0, e = SrcChild.getNumChildren(); i != e; ++i) { + auto &SubOperand = SrcChild.getChild(i); + if (!SubOperand.getName().empty()) { if (auto Error = Rule.defineComplexSubOperand( - SubOperand->getName(), SrcChild->getOperator(), RendererID, i, + SubOperand.getName(), SrcChild.getOperator(), RendererID, i, SrcChildName)) return Error; } @@ -1038,7 +1038,7 @@ Error GlobalISelEmitter::importChildMatcher( } auto MaybeInsnOperand = OM.addPredicate( - InsnMatcher.getRuleMatcher(), SrcChild->getName()); + InsnMatcher.getRuleMatcher(), SrcChild.getName()); if (!MaybeInsnOperand) { // This isn't strictly true. If the user were to provide exactly the same // matchers as the original operand then we could allow it. However, it's @@ -1057,11 +1057,11 @@ Error GlobalISelEmitter::importChildMatcher( return Error::success(); } - if (SrcChild->hasAnyPredicate()) + if (SrcChild.hasAnyPredicate()) return failedImport("Src pattern child has unsupported predicate"); // Check for constant immediates. - if (auto *ChildInt = dyn_cast(SrcChild->getLeafValue())) { + if (auto *ChildInt = dyn_cast(SrcChild.getLeafValue())) { if (OperandIsImmArg) { // Checks for argument directly in operand list OM.addPredicate(ChildInt->getValue()); @@ -1073,7 +1073,7 @@ Error GlobalISelEmitter::importChildMatcher( } // Check for def's like register classes or ComplexPattern's. - if (auto *ChildDefInit = dyn_cast(SrcChild->getLeafValue())) { + if (auto *ChildDefInit = dyn_cast(SrcChild.getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); // Check for register classes. 
@@ -1121,7 +1121,7 @@ Error GlobalISelEmitter::importChildMatcher( const bool ImmAllOnesV = ChildRec->getName() == "immAllOnesV"; if (ImmAllOnesV || ChildRec->getName() == "immAllZerosV") { auto MaybeInsnOperand = OM.addPredicate( - InsnMatcher.getRuleMatcher(), SrcChild->getName(), false); + InsnMatcher.getRuleMatcher(), SrcChild.getName(), false); InstructionOperandMatcher &InsnOperand = **MaybeInsnOperand; ValueTypeByHwMode VTy = ChildTypes.front().getValueTypeByHwMode(); @@ -1161,45 +1161,44 @@ Error GlobalISelEmitter::importChildMatcher( Expected GlobalISelEmitter::importExplicitUseRenderer( action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, - const TreePatternNode *DstChild, const TreePatternNode *Src) { + const TreePatternNode &DstChild, const TreePatternNode &Src) { - const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName()); + const auto &SubOperand = Rule.getComplexSubOperand(DstChild.getName()); if (SubOperand) { DstMIBuilder.addRenderer( - *std::get<0>(*SubOperand), DstChild->getName(), - std::get<1>(*SubOperand), std::get<2>(*SubOperand)); + *std::get<0>(*SubOperand), DstChild.getName(), std::get<1>(*SubOperand), + std::get<2>(*SubOperand)); return InsertPt; } - if (!DstChild->isLeaf()) { - if (DstChild->getOperator()->isSubClassOf("SDNodeXForm")) { - auto Child = DstChild->getChild(0); - auto I = SDNodeXFormEquivs.find(DstChild->getOperator()); + if (!DstChild.isLeaf()) { + if (DstChild.getOperator()->isSubClassOf("SDNodeXForm")) { + auto &Child = DstChild.getChild(0); + auto I = SDNodeXFormEquivs.find(DstChild.getOperator()); if (I != SDNodeXFormEquivs.end()) { - Record *XFormOpc = DstChild->getOperator()->getValueAsDef("Opcode"); + Record *XFormOpc = DstChild.getOperator()->getValueAsDef("Opcode"); if (XFormOpc->getName() == "timm") { // If this is a TargetConstant, there won't be a corresponding // instruction to transform. Instead, this will refer directly to an // operand in an instruction's operand list. DstMIBuilder.addRenderer(*I->second, - Child->getName()); + Child.getName()); } else { - DstMIBuilder.addRenderer(*I->second, - Child->getName()); + DstMIBuilder.addRenderer(*I->second, Child.getName()); } return InsertPt; } - return failedImport("SDNodeXForm " + Child->getName() + + return failedImport("SDNodeXForm " + Child.getName() + " has no custom renderer"); } // We accept 'bb' here. It's an operator because BasicBlockSDNode isn't // inline, but in MI it's just another operand. - if (DstChild->getOperator()->isSubClassOf("SDNode")) { - auto &ChildSDNI = CGP.getSDNodeInfo(DstChild->getOperator()); + if (DstChild.getOperator()->isSubClassOf("SDNode")) { + auto &ChildSDNI = CGP.getSDNodeInfo(DstChild.getOperator()); if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") { - DstMIBuilder.addRenderer(DstChild->getName()); + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; } } @@ -1208,19 +1207,19 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( // rendered as operands. // FIXME: The target should be able to choose sign-extended when appropriate // (e.g. on Mips). 
- if (DstChild->getOperator()->getName() == "timm") { - DstMIBuilder.addRenderer(DstChild->getName()); + if (DstChild.getOperator()->getName() == "timm") { + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; - } else if (DstChild->getOperator()->getName() == "imm") { - DstMIBuilder.addRenderer(DstChild->getName()); + } else if (DstChild.getOperator()->getName() == "imm") { + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; - } else if (DstChild->getOperator()->getName() == "fpimm") { + } else if (DstChild.getOperator()->getName() == "fpimm") { DstMIBuilder.addRenderer( - DstChild->getName()); + DstChild.getName()); return InsertPt; } - if (DstChild->getOperator()->isSubClassOf("Instruction")) { + if (DstChild.getOperator()->isSubClassOf("Instruction")) { auto OpTy = getInstResultType(DstChild); if (!OpTy) return OpTy.takeError(); @@ -1238,22 +1237,22 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( } return failedImport("Dst pattern child isn't a leaf node or an MBB" + - llvm::to_string(*DstChild)); + llvm::to_string(DstChild)); } // It could be a specific immediate in which case we should just check for // that immediate. if (const IntInit *ChildIntInit = - dyn_cast(DstChild->getLeafValue())) { + dyn_cast(DstChild.getLeafValue())) { DstMIBuilder.addRenderer(ChildIntInit->getValue()); return InsertPt; } // Otherwise, we're looking for a bog-standard RegisterClass operand. - if (auto *ChildDefInit = dyn_cast(DstChild->getLeafValue())) { + if (auto *ChildDefInit = dyn_cast(DstChild.getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); - ArrayRef ChildTypes = DstChild->getExtTypes(); + ArrayRef ChildTypes = DstChild.getExtTypes(); if (ChildTypes.size() != 1) return failedImport("Dst pattern child has multiple results"); @@ -1274,11 +1273,11 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( if (ChildRec->isSubClassOf("RegisterOperand") && !ChildRec->isValueUnset("GIZeroRegister")) { DstMIBuilder.addRenderer( - DstChild->getName(), ChildRec->getValueAsDef("GIZeroRegister")); + DstChild.getName(), ChildRec->getValueAsDef("GIZeroRegister")); return InsertPt; } - DstMIBuilder.addRenderer(DstChild->getName()); + DstMIBuilder.addRenderer(DstChild.getName()); return InsertPt; } @@ -1294,9 +1293,9 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( return failedImport( "SelectionDAG ComplexPattern not mapped to GlobalISel"); - const OperandMatcher &OM = Rule.getOperandMatcher(DstChild->getName()); + const OperandMatcher &OM = Rule.getOperandMatcher(DstChild.getName()); DstMIBuilder.addRenderer( - *ComplexPattern->second, DstChild->getName(), + *ComplexPattern->second, DstChild.getName(), OM.getAllocatedTemporariesBaseID()); return InsertPt; } @@ -1307,10 +1306,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( // Handle the case where the MVT/register class is omitted in the dest pattern // but MVT exists in the source pattern. 
- if (isa(DstChild->getLeafValue())) { - for (unsigned NumOp = 0; NumOp < Src->getNumChildren(); NumOp++) - if (Src->getChild(NumOp)->getName() == DstChild->getName()) { - DstMIBuilder.addRenderer(Src->getChild(NumOp)->getName()); + if (isa(DstChild.getLeafValue())) { + for (unsigned NumOp = 0; NumOp < Src.getNumChildren(); NumOp++) + if (Src.getChild(NumOp).getName() == DstChild.getName()) { + DstMIBuilder.addRenderer(Src.getChild(NumOp).getName()); return InsertPt; } } @@ -1318,8 +1317,8 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( } Expected GlobalISelEmitter::createAndImportInstructionRenderer( - RuleMatcher &M, InstructionMatcher &InsnMatcher, const TreePatternNode *Src, - const TreePatternNode *Dst) { + RuleMatcher &M, InstructionMatcher &InsnMatcher, const TreePatternNode &Src, + const TreePatternNode &Dst) { auto InsertPtOrError = createInstructionRenderer(M.actions_end(), M, Dst); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -1353,8 +1352,8 @@ Expected GlobalISelEmitter::createAndImportInstructionRenderer( Expected GlobalISelEmitter::createAndImportSubInstructionRenderer( - const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, - const TreePatternNode *Src, unsigned TempRegID) { + const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst, + const TreePatternNode &Src, unsigned TempRegID) { auto InsertPtOrError = createInstructionRenderer(InsertPt, M, Dst); // TODO: Assert there's exactly one result. @@ -1376,15 +1375,15 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( // We need to make sure that when we import an INSERT_SUBREG as a // subinstruction that it ends up being constrained to the correct super // register and subregister classes. - auto OpName = Target.getInstruction(Dst->getOperator()).TheDef->getName(); + auto OpName = Target.getInstruction(Dst.getOperator()).TheDef->getName(); if (OpName == "INSERT_SUBREG") { - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from INSERT_SUBREG operand #1"); std::optional SuperClass = - inferSuperRegisterClassForNode(Dst->getExtType(0), Dst->getChild(0), - Dst->getChild(2)); + inferSuperRegisterClassForNode(Dst.getExtType(0), Dst.getChild(0), + Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for INSERT_SUBREG operand #0"); @@ -1404,12 +1403,12 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( // instructions, the result register class is controlled by the // subregisters of the operand. As a result, we must constrain the result // class rather than check that it's already the right one. - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); if (!SuperClass) return failedImport( "Cannot infer register class from EXTRACT_SUBREG operand #0"); - auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1)); + auto SubIdx = inferSubRegIndexForNode(Dst.getChild(1)); if (!SubIdx) return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); @@ -1429,12 +1428,12 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( // Similar to INSERT_SUBREG, we also have to handle SUBREG_TO_REG as a // subinstruction. 
if (OpName == "SUBREG_TO_REG") { - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from SUBREG_TO_REG child #1"); auto SuperClass = - inferSuperRegisterClass(Dst->getExtType(0), Dst->getChild(2)); + inferSuperRegisterClass(Dst.getExtType(0), Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for SUBREG_TO_REG operand #0"); @@ -1446,13 +1445,13 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( } if (OpName == "REG_SEQUENCE") { - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); M.insertAction( InsertPt, DstMIBuilder.getInsnID(), 0, **SuperClass); - unsigned Num = Dst->getNumChildren(); + unsigned Num = Dst.getNumChildren(); for (unsigned I = 1; I != Num; I += 2) { - const TreePatternNode *SubRegChild = Dst->getChild(I + 1); + const TreePatternNode &SubRegChild = Dst.getChild(I + 1); auto SubIdx = inferSubRegIndexForNode(SubRegChild); if (!SubIdx) @@ -1474,8 +1473,8 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( } Expected GlobalISelEmitter::createInstructionRenderer( - action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst) { - Record *DstOp = Dst->getOperator(); + action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst) { + Record *DstOp = Dst.getOperator(); if (!DstOp->isSubClassOf("Instruction")) { if (DstOp->isSubClassOf("ValueType")) return failedImport( @@ -1496,9 +1495,9 @@ Expected GlobalISelEmitter::createInstructionRenderer( Expected GlobalISelEmitter::importExplicitDefRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const TreePatternNode *Src, const TreePatternNode *Dst) { + const TreePatternNode &Src, const TreePatternNode &Dst) { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); - const unsigned SrcNumDefs = Src->getExtTypes().size(); + const unsigned SrcNumDefs = Src.getExtTypes().size(); const unsigned DstNumDefs = DstI->Operands.NumDefs; if (DstNumDefs == 0) return InsertPt; @@ -1513,11 +1512,11 @@ Expected GlobalISelEmitter::importExplicitDefRenderers( // Some instructions have multiple defs, but are missing a type entry // (e.g. s_cc_out operands). - if (Dst->getExtTypes().size() < DstNumDefs) + if (Dst.getExtTypes().size() < DstNumDefs) return failedImport("unhandled discarded def"); for (unsigned I = SrcNumDefs; I < DstNumDefs; ++I) { - const TypeSetByHwMode &ExtTy = Dst->getExtType(I); + const TypeSetByHwMode &ExtTy = Dst.getExtType(I); if (!ExtTy.isMachineValueType()) return failedImport("unsupported typeset"); @@ -1536,24 +1535,24 @@ Expected GlobalISelEmitter::importExplicitDefRenderers( Expected GlobalISelEmitter::importExplicitUseRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const llvm::TreePatternNode *Dst, const llvm::TreePatternNode *Src) { + const llvm::TreePatternNode &Dst, const llvm::TreePatternNode &Src) { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); - CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst->getOperator()); + CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst.getOperator()); StringRef Name = OrigDstI->TheDef->getName(); - unsigned ExpectedDstINumUses = Dst->getNumChildren(); + unsigned ExpectedDstINumUses = Dst.getNumChildren(); // EXTRACT_SUBREG needs to use a subregister COPY. 
if (Name == "EXTRACT_SUBREG") { - if (!Dst->getChild(1)->isLeaf()) + if (!Dst.getChild(1).isLeaf()) return failedImport("EXTRACT_SUBREG child #1 is not a leaf"); - DefInit *SubRegInit = dyn_cast(Dst->getChild(1)->getLeafValue()); + DefInit *SubRegInit = dyn_cast(Dst.getChild(1).getLeafValue()); if (!SubRegInit) return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); - const TreePatternNode *ValChild = Dst->getChild(0); - if (!ValChild->isLeaf()) { + const TreePatternNode &ValChild = Dst.getChild(0); + if (!ValChild.isLeaf()) { // We really have to handle the source instruction, and then insert a // copy from the subregister. auto ExtractSrcTy = getInstResultType(ValChild); @@ -1574,7 +1573,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( } // If this is a source operand, this is just a subregister copy. - Record *RCDef = getInitValueAsRegClass(ValChild->getLeafValue()); + Record *RCDef = getInitValueAsRegClass(ValChild.getLeafValue()); if (!RCDef) return failedImport("EXTRACT_SUBREG child #0 could not " "be coerced to a register class"); @@ -1589,7 +1588,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( return failedImport("EXTRACT_SUBREG requires an additional COPY"); } - StringRef RegOperandName = Dst->getChild(0)->getName(); + StringRef RegOperandName = Dst.getChild(0).getName(); if (const auto &SubOperand = M.getComplexSubOperand(RegOperandName)) { DstMIBuilder.addRenderer( *std::get<0>(*SubOperand), RegOperandName, std::get<1>(*SubOperand), @@ -1602,10 +1601,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( } if (Name == "REG_SEQUENCE") { - if (!Dst->getChild(0)->isLeaf()) + if (!Dst.getChild(0).isLeaf()) return failedImport("REG_SEQUENCE child #0 is not a leaf"); - Record *RCDef = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); + Record *RCDef = getInitValueAsRegClass(Dst.getChild(0).getLeafValue()); if (!RCDef) return failedImport("REG_SEQUENCE child #0 could not " "be coerced to a register class"); @@ -1614,11 +1613,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( return failedImport("Malformed REG_SEQUENCE"); for (unsigned I = 1; I != ExpectedDstINumUses; I += 2) { - const TreePatternNode *ValChild = Dst->getChild(I); - const TreePatternNode *SubRegChild = Dst->getChild(I + 1); + const TreePatternNode &ValChild = Dst.getChild(I); + const TreePatternNode &SubRegChild = Dst.getChild(I + 1); - if (DefInit *SubRegInit = - dyn_cast(SubRegChild->getLeafValue())) { + if (DefInit *SubRegInit = dyn_cast(SubRegChild.getLeafValue())) { CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); auto InsertPtOrError = @@ -1676,7 +1674,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( // If the operand has default values, introduce them now. if (CGP.operandHasDefault(OperandNode) && - (InstOpNo < NonOverridableOperands || Child >= Dst->getNumChildren())) { + (InstOpNo < NonOverridableOperands || Child >= Dst.getNumChildren())) { // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands. 
@@ -1691,7 +1689,7 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( } auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder, - Dst->getChild(Child), Src); + Dst.getChild(Child), Src); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); InsertPt = InsertPtOrError.get(); @@ -1712,14 +1710,14 @@ Error GlobalISelEmitter::importDefaultOperandRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, const DAGDefaultOperand &DefaultOp) const { for (const auto &Op : DefaultOp.DefaultOps) { - const auto *N = Op.get(); - if (!N->isLeaf()) + const auto &N = *Op; + if (!N.isLeaf()) return failedImport("Could not add default op"); - const auto *DefaultOp = N->getLeafValue(); + const auto *DefaultOp = N.getLeafValue(); if (const DefInit *DefaultDefOp = dyn_cast(DefaultOp)) { - std::optional OpTyOrNone = MVTToLLT(N->getSimpleType(0)); + std::optional OpTyOrNone = MVTToLLT(N.getSimpleType(0)); auto Def = DefaultDefOp->getDef(); if (Def->getName() == "undef_tied_input") { unsigned TempRegID = M.allocateTempRegID(); @@ -1758,10 +1756,9 @@ Error GlobalISelEmitter::importImplicitDefRenderers( } std::optional -GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode *Leaf) { - assert(Leaf && "Expected node?"); - assert(Leaf->isLeaf() && "Expected leaf?"); - Record *RCRec = getInitValueAsRegClass(Leaf->getLeafValue()); +GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode &Leaf) { + assert(Leaf.isLeaf() && "Expected leaf?"); + Record *RCRec = getInitValueAsRegClass(Leaf.getLeafValue()); if (!RCRec) return std::nullopt; CodeGenRegisterClass *RC = CGRegs.getRegClass(RCRec); @@ -1771,20 +1768,17 @@ GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode *Leaf) { } std::optional -GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) { - if (!N) - return std::nullopt; - - if (N->isLeaf()) +GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode &N) { + if (N.isLeaf()) return getRegClassFromLeaf(N); // We don't have a leaf node, so we have to try and infer something. Check // that we have an instruction that we an infer something from. // Only handle things that produce a single type. - if (N->getNumTypes() != 1) + if (N.getNumTypes() != 1) return std::nullopt; - Record *OpRec = N->getOperator(); + Record *OpRec = N.getOperator(); // We only want instructions. if (!OpRec->isSubClassOf("Instruction")) @@ -1803,21 +1797,21 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) { if (IsRegSequence || InstName == "COPY_TO_REGCLASS") { // If we have a COPY_TO_REGCLASS, then we need to handle it specially. It // has the desired register class as the first child. - const TreePatternNode *RCChild = N->getChild(IsRegSequence ? 0 : 1); - if (!RCChild->isLeaf()) + const TreePatternNode &RCChild = N.getChild(IsRegSequence ? 
0 : 1); + if (!RCChild.isLeaf()) return std::nullopt; return getRegClassFromLeaf(RCChild); } if (InstName == "INSERT_SUBREG") { - const TreePatternNode *Child0 = N->getChild(0); - assert(Child0->getNumTypes() == 1 && "Unexpected number of types!"); - const TypeSetByHwMode &VTy = Child0->getExtType(0); - return inferSuperRegisterClassForNode(VTy, Child0, N->getChild(2)); + const TreePatternNode &Child0 = N.getChild(0); + assert(Child0.getNumTypes() == 1 && "Unexpected number of types!"); + const TypeSetByHwMode &VTy = Child0.getExtType(0); + return inferSuperRegisterClassForNode(VTy, Child0, N.getChild(2)); } if (InstName == "EXTRACT_SUBREG") { - assert(N->getNumTypes() == 1 && "Unexpected number of types!"); - const TypeSetByHwMode &VTy = N->getExtType(0); - return inferSuperRegisterClass(VTy, N->getChild(1)); + assert(N.getNumTypes() == 1 && "Unexpected number of types!"); + const TypeSetByHwMode &VTy = N.getExtType(0); + return inferSuperRegisterClass(VTy, N.getChild(1)); } // Handle destination record types that we can safely infer a register class @@ -1840,14 +1834,13 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) { std::optional GlobalISelEmitter::inferSuperRegisterClass( - const TypeSetByHwMode &Ty, const TreePatternNode *SubRegIdxNode) { - assert(SubRegIdxNode && "Expected subregister index node!"); + const TypeSetByHwMode &Ty, const TreePatternNode &SubRegIdxNode) { // We need a ValueTypeByHwMode for getSuperRegForSubReg. if (!Ty.isValueTypeByHwMode(false)) return std::nullopt; - if (!SubRegIdxNode->isLeaf()) + if (!SubRegIdxNode.isLeaf()) return std::nullopt; - DefInit *SubRegInit = dyn_cast(SubRegIdxNode->getLeafValue()); + DefInit *SubRegInit = dyn_cast(SubRegIdxNode.getLeafValue()); if (!SubRegInit) return std::nullopt; CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); @@ -1864,9 +1857,8 @@ GlobalISelEmitter::inferSuperRegisterClass( std::optional GlobalISelEmitter::inferSuperRegisterClassForNode( - const TypeSetByHwMode &Ty, const TreePatternNode *SuperRegNode, - const TreePatternNode *SubRegIdxNode) { - assert(SuperRegNode && "Expected super register node!"); + const TypeSetByHwMode &Ty, const TreePatternNode &SuperRegNode, + const TreePatternNode &SubRegIdxNode) { // Check if we already have a defined register class for the super register // node. If we do, then we should preserve that rather than inferring anything // from the subregister index node. 
We can assume that whoever wrote the @@ -1879,11 +1871,11 @@ GlobalISelEmitter::inferSuperRegisterClassForNode( } std::optional GlobalISelEmitter::inferSubRegIndexForNode( - const TreePatternNode *SubRegIdxNode) { - if (!SubRegIdxNode->isLeaf()) + const TreePatternNode &SubRegIdxNode) { + if (!SubRegIdxNode.isLeaf()) return std::nullopt; - DefInit *SubRegInit = dyn_cast(SubRegIdxNode->getLeafValue()); + DefInit *SubRegInit = dyn_cast(SubRegIdxNode.getLeafValue()); if (!SubRegInit) return std::nullopt; return CGRegs.getSubRegIdx(SubRegInit->getDef()); @@ -1894,9 +1886,9 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { int Score = P.getPatternComplexity(CGP); RuleMatcher M(P.getSrcRecord()->getLoc()); RuleMatcherScores[M.getRuleID()] = Score; - M.addAction(llvm::to_string(*P.getSrcPattern()) + + M.addAction(llvm::to_string(P.getSrcPattern()) + " => " + - llvm::to_string(*P.getDstPattern())); + llvm::to_string(P.getDstPattern())); SmallVector Predicates; P.getPredicateRecords(Predicates); @@ -1907,8 +1899,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { M.addHwModeIdx(declareHwModeCheck(P.getHwModeFeatures())); // Next, analyze the pattern operators. - TreePatternNode *Src = P.getSrcPattern(); - TreePatternNode *Dst = P.getDstPattern(); + TreePatternNode &Src = P.getSrcPattern(); + TreePatternNode &Dst = P.getDstPattern(); // If the root of either pattern isn't a simple operator, ignore it. if (auto Err = isTrivialOperatorNode(Dst)) @@ -1939,7 +1931,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // the capture accross rules. The downside is that it would // introduce a dependency between predicates (captures must happen // before their first use.) - InstructionMatcher &InsnMatcherTemp = M.addInstructionMatcher(Src->getName()); + InstructionMatcher &InsnMatcherTemp = M.addInstructionMatcher(Src.getName()); unsigned TempOpIdx = 0; const auto SavedFlags = M.setGISelFlags(P.getSrcRecord()); @@ -1950,8 +1942,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { return std::move(Error); InstructionMatcher &InsnMatcher = InsnMatcherOrError.get(); - if (Dst->isLeaf()) { - Record *RCDef = getInitValueAsRegClass(Dst->getLeafValue()); + if (Dst.isLeaf()) { + Record *RCDef = getInitValueAsRegClass(Dst.getLeafValue()); if (RCDef) { const CodeGenRegisterClass &RC = Target.getRegisterClass(RCDef); @@ -1969,7 +1961,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { auto &DstMIBuilder = M.addAction(M.allocateOutputInsnID(), &DstI); DstMIBuilder.addRenderer(DstIOperand.Name); - DstMIBuilder.addRenderer(Dst->getName()); + DstMIBuilder.addRenderer(Dst.getName()); M.addAction(0, 0, RC); // Erase the root. @@ -1986,7 +1978,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { } // Start with the defined operands (i.e., the results of the root operator). 
- Record *DstOp = Dst->getOperator(); + Record *DstOp = Dst.getOperator(); if (!DstOp->isSubClassOf("Instruction")) return failedImport("Pattern operator isn't an instruction"); @@ -1994,7 +1986,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { StringRef DstIName = DstI.TheDef->getName(); unsigned DstNumDefs = DstI.Operands.NumDefs, - SrcNumDefs = Src->getExtTypes().size(); + SrcNumDefs = Src.getExtTypes().size(); if (DstNumDefs < SrcNumDefs) { if (DstNumDefs != 0) return failedImport("Src pattern result has more defs than dst MI (" + @@ -2017,23 +2009,23 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { unsigned OpIdx = 0; unsigned N = std::min(DstNumDefs, SrcNumDefs); for (unsigned I = 0; I < N; ++I) { - const TypeSetByHwMode &VTy = Src->getExtType(I); + const TypeSetByHwMode &VTy = Src.getExtType(I); const auto &DstIOperand = DstI.Operands[OpIdx]; PointerUnion MatchedRC = DstIOperand.Rec; if (DstIName == "COPY_TO_REGCLASS") { - MatchedRC = getInitValueAsRegClass(Dst->getChild(1)->getLeafValue()); + MatchedRC = getInitValueAsRegClass(Dst.getChild(1).getLeafValue()); if (MatchedRC.isNull()) return failedImport( "COPY_TO_REGCLASS operand #1 isn't a register class"); } else if (DstIName == "REG_SEQUENCE") { - MatchedRC = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); + MatchedRC = getInitValueAsRegClass(Dst.getChild(0).getLeafValue()); if (MatchedRC.isNull()) return failedImport("REG_SEQUENCE operand #0 isn't a register class"); } else if (DstIName == "EXTRACT_SUBREG") { - auto InferredClass = inferRegClassFromPattern(Dst->getChild(0)); + auto InferredClass = inferRegClassFromPattern(Dst.getChild(0)); if (!InferredClass) return failedImport( "Could not infer class for EXTRACT_SUBREG operand #0"); @@ -2042,8 +2034,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // register. MatchedRC = (*InferredClass)->getDef(); } else if (DstIName == "INSERT_SUBREG") { - auto MaybeSuperClass = inferSuperRegisterClassForNode( - VTy, Dst->getChild(0), Dst->getChild(2)); + auto MaybeSuperClass = + inferSuperRegisterClassForNode(VTy, Dst.getChild(0), Dst.getChild(2)); if (!MaybeSuperClass) return failedImport( "Cannot infer register class for INSERT_SUBREG operand #0"); @@ -2052,7 +2044,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // set DstIOpRec using this. MatchedRC = *MaybeSuperClass; } else if (DstIName == "SUBREG_TO_REG") { - auto MaybeRegClass = inferSuperRegisterClass(VTy, Dst->getChild(2)); + auto MaybeRegClass = inferSuperRegisterClass(VTy, Dst.getChild(2)); if (!MaybeRegClass) return failedImport( "Cannot infer register class for SUBREG_TO_REG operand #0"); @@ -2060,8 +2052,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { } else if (MatchedRC.get()->isSubClassOf("RegisterOperand")) MatchedRC = MatchedRC.get()->getValueAsDef("RegClass"); else if (!MatchedRC.get()->isSubClassOf("RegisterClass")) - return failedImport("Dst MI def isn't a register class" + - to_string(*Dst)); + return failedImport("Dst MI def isn't a register class" + to_string(Dst)); OperandMatcher &OM = InsnMatcher.getOperand(OpIdx); // The operand names declared in the DstI instruction are unrelated to @@ -2095,8 +2086,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { if (DstIName == "COPY_TO_REGCLASS") { // COPY_TO_REGCLASS does not provide operand constraints itself but the // result is constrained to the class given by the second child. 
- Record *DstIOpRec = - getInitValueAsRegClass(Dst->getChild(1)->getLeafValue()); + Record *DstIOpRec = getInitValueAsRegClass(Dst.getChild(1).getLeafValue()); if (DstIOpRec == nullptr) return failedImport("COPY_TO_REGCLASS operand #1 isn't a register class"); @@ -2104,12 +2094,12 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { M.addAction( 0, 0, Target.getRegisterClass(DstIOpRec)); } else if (DstIName == "EXTRACT_SUBREG") { - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); if (!SuperClass) return failedImport( "Cannot infer register class from EXTRACT_SUBREG operand #0"); - auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1)); + auto SubIdx = inferSubRegIndexForNode(Dst.getChild(1)); if (!SubIdx) return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); @@ -2119,7 +2109,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // // FIXME: This may introduce an extra copy if the chosen class doesn't // actually contain the subregisters. - assert(Src->getExtTypes().size() == 1 && + assert(Src.getExtTypes().size() == 1 && "Expected Src of EXTRACT_SUBREG to have one result type"); const auto SrcRCDstRCPair = @@ -2134,16 +2124,16 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { *SrcRCDstRCPair->second); M.addAction(0, 1, *SrcRCDstRCPair->first); } else if (DstIName == "INSERT_SUBREG") { - assert(Src->getExtTypes().size() == 1 && + assert(Src.getExtTypes().size() == 1 && "Expected Src of INSERT_SUBREG to have one result type"); // We need to constrain the destination, a super regsister source, and a // subregister source. - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from INSERT_SUBREG operand #1"); auto SuperClass = inferSuperRegisterClassForNode( - Src->getExtType(0), Dst->getChild(0), Dst->getChild(2)); + Src.getExtType(0), Dst.getChild(0), Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for INSERT_SUBREG operand #0"); @@ -2152,32 +2142,32 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { M.addAction(0, 2, **SubClass); } else if (DstIName == "SUBREG_TO_REG") { // We need to constrain the destination and subregister source. - assert(Src->getExtTypes().size() == 1 && + assert(Src.getExtTypes().size() == 1 && "Expected Src of SUBREG_TO_REG to have one result type"); // Attempt to infer the subregister source from the first child. If it has // an explicitly given register class, we'll use that. Otherwise, we will // fail. - auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + auto SubClass = inferRegClassFromPattern(Dst.getChild(1)); if (!SubClass) return failedImport( "Cannot infer register class from SUBREG_TO_REG child #1"); // We don't have a child to look at that might have a super register node. 
auto SuperClass = - inferSuperRegisterClass(Src->getExtType(0), Dst->getChild(2)); + inferSuperRegisterClass(Src.getExtType(0), Dst.getChild(2)); if (!SuperClass) return failedImport( "Cannot infer register class for SUBREG_TO_REG operand #0"); M.addAction(0, 0, **SuperClass); M.addAction(0, 2, **SubClass); } else if (DstIName == "REG_SEQUENCE") { - auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + auto SuperClass = inferRegClassFromPattern(Dst.getChild(0)); M.addAction(0, 0, **SuperClass); - unsigned Num = Dst->getNumChildren(); + unsigned Num = Dst.getNumChildren(); for (unsigned I = 1; I != Num; I += 2) { - TreePatternNode *SubRegChild = Dst->getChild(I + 1); + TreePatternNode &SubRegChild = Dst.getChild(I + 1); auto SubIdx = inferSubRegIndexForNode(SubRegChild); if (!SubIdx) -- cgit v1.1 From 0de859c8f22669ab7a816afdf975c7b012e511b9 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 9 Feb 2024 14:16:48 +0000 Subject: [MC] Fix operator++ for various MCRegister iterators (#81250) Return *this from operator++. NFC, this just allows using ++Iter in an expression in future patches. --- llvm/include/llvm/MC/MCRegisterInfo.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index e52f0a4..fb4d11e 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -572,9 +572,10 @@ public: bool isValid() const { return SRIter.isValid(); } /// Moves to the next position. - void operator++() { + MCSubRegIndexIterator &operator++() { ++SRIter; ++SRIndex; + return *this; } }; @@ -688,9 +689,10 @@ public: bool isValid() const { return RUIter.isValid(); } /// Moves to the next position. - void operator++() { + MCRegUnitMaskIterator &operator++() { ++MaskListIter; ++RUIter; + return *this; } }; @@ -728,10 +730,11 @@ public: } /// Preincrement to move to the next root register. - void operator++() { + MCRegUnitRootIterator &operator++() { assert(isValid() && "Cannot move off the end of the list."); Reg0 = Reg1; Reg1 = 0; + return *this; } }; @@ -788,10 +791,11 @@ public: } } - void operator++() { + MCRegAliasIterator &operator++() { assert(isValid() && "Cannot move off the end of the list."); do advance(); while (!IncludeSelf && isValid() && *SI == Reg); + return *this; } }; -- cgit v1.1 From b5abaea3c0de605c8145035b21a5ee492883ebd7 Mon Sep 17 00:00:00 2001 From: stephenpeckham <118857872+stephenpeckham@users.noreply.github.com> Date: Fri, 9 Feb 2024 08:20:21 -0600 Subject: [yaml2obj][XCOFF] Update yaml2obj for XCOFF to create valid XCOFF files in more cases. (#77620) yaml2obj creates invalid object files even when the input was created by obj2yaml using a valid object file. On the other hand, yaml2obj is used to intentionally create invalid object files for testing purposes. This update balances using specified input values when provided and computing file offsets and sizes if necessary.
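The reconciliation rule this change applies uniformly (to relocation offsets, section-data offsets, and the symbol table offset) can be sketched in standalone C++ roughly as follows. This is an illustrative sketch only, not the emitter's actual code; reconcileOffset and its message text are invented for the example:

    #include <cstdint>
    #include <cstdio>

    // Sketch of the offset-reconciliation rule: a specified value of 0 means
    // "not given in the YAML input", so the computed offset is kept; a nonzero
    // value is honored unless honoring it would overwrite bytes already laid
    // out, which is reported as an error.
    static bool reconcileOffset(uint64_t &CurrentOffset, uint64_t Specified,
                                const char *FieldName) {
      if (Specified == 0)
        return true; // nothing specified; keep the computed offset
      if (CurrentOffset > Specified) {
        std::fprintf(stderr,
                     "current file offset (%llu) is bigger than the specified "
                     "%s (%llu)\n",
                     (unsigned long long)CurrentOffset, FieldName,
                     (unsigned long long)Specified);
        return false; // honoring it would clobber data already written
      }
      CurrentOffset = Specified; // honor the YAML-provided offset
      return true;
    }

The init* routines in the diff below follow this shape for their respective fields, then advance CurrentOffset past the bytes they lay out and check the result against MaxRawDataSize.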
--- llvm/lib/ObjectYAML/XCOFFEmitter.cpp | 232 +++++++++++++-------- .../tools/llvm-objcopy/XCOFF/invalid-read.test | 6 +- .../XCOFF/disassemble-traceback-table.test | 2 +- .../tools/llvm-objdump/XCOFF/section-headers.test | 2 +- .../test/tools/llvm-readobj/XCOFF/file-header.test | 3 +- llvm/test/tools/llvm-readobj/XCOFF/sections.test | 20 +- llvm/test/tools/obj2yaml/XCOFF/aix.yaml | 4 +- .../test/tools/obj2yaml/XCOFF/invalid-section.yaml | 3 +- .../tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml | 24 +-- llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml | 4 +- llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml | 91 ++++++++ 11 files changed, 269 insertions(+), 122 deletions(-) create mode 100644 llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index 5b244ff..f68c571 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -41,17 +41,19 @@ public: bool writeXCOFF(); private: + void reportOverwrite(uint64_t currentOffset, uint64_t specifiedOffset, + const Twine &fieldName); bool nameShouldBeInStringTable(StringRef SymbolName); bool initFileHeader(uint64_t CurrentOffset); void initAuxFileHeader(); - bool initSectionHeader(uint64_t &CurrentOffset); + bool initSectionHeaders(uint64_t &CurrentOffset); bool initRelocations(uint64_t &CurrentOffset); bool initStringTable(); bool assignAddressesAndIndices(); void writeFileHeader(); void writeAuxFileHeader(); - void writeSectionHeader(); + void writeSectionHeaders(); bool writeSectionData(); bool writeRelocations(); bool writeSymbols(); @@ -91,6 +93,14 @@ static void writeName(StringRef StrName, support::endian::Writer W) { W.write(NameRef); } +void XCOFFWriter::reportOverwrite(uint64_t CurrentOffset, + uint64_t specifiedOffset, + const Twine &fieldName) { + ErrHandler("current file offset (" + Twine(CurrentOffset) + + ") is bigger than the specified " + fieldName + " (" + + Twine(specifiedOffset) + ") "); +} + bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) { // For XCOFF64: The symbol name is always in the string table. return (SymbolName.size() > XCOFF::NameSize) || Is64Bit; @@ -99,14 +109,31 @@ bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) { bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) { for (XCOFFYAML::Section &InitSection : InitSections) { if (!InitSection.Relocations.empty()) { - InitSection.NumberOfRelocations = InitSection.Relocations.size(); - InitSection.FileOffsetToRelocations = CurrentOffset; uint64_t RelSize = Is64Bit ? XCOFF::RelocationSerializationSize64 : XCOFF::RelocationSerializationSize32; - CurrentOffset += InitSection.NumberOfRelocations * RelSize; + uint64_t UsedSize = RelSize * InitSection.Relocations.size(); + + // If NumberOfRelocations was specified, we use it, even if it's + // not consistent with the number of provided relocations. + if (!InitSection.NumberOfRelocations) + InitSection.NumberOfRelocations = InitSection.Relocations.size(); + + // If the YAML file specified an offset to relocations, we use it. 
+ if (InitSection.FileOffsetToRelocations) { + if (CurrentOffset > InitSection.FileOffsetToRelocations) { + reportOverwrite(CurrentOffset, InitSection.FileOffsetToRelocations, + "FileOffsetToRelocations for the " + + InitSection.SectionName + " section"); + return false; + } + CurrentOffset = InitSection.FileOffsetToRelocations; + } else + InitSection.FileOffsetToRelocations = CurrentOffset; + CurrentOffset += UsedSize; if (CurrentOffset > MaxRawDataSize) { - ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + - "exceeded when writing relocation data"); + ErrHandler("maximum object size (" + Twine(MaxRawDataSize) + + ") exceeded when writing relocation data for section " + + Twine(InitSection.SectionName)); return false; } } @@ -114,15 +141,10 @@ bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) { return true; } -bool XCOFFWriter::initSectionHeader(uint64_t &CurrentOffset) { - uint64_t CurrentSecAddr = 0; +bool XCOFFWriter::initSectionHeaders(uint64_t &CurrentOffset) { + uint64_t CurrentEndDataAddr = 0; + uint64_t CurrentEndTDataAddr = 0; for (uint16_t I = 0, E = InitSections.size(); I < E; ++I) { - if (CurrentOffset > MaxRawDataSize) { - ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + - "exceeded when writing section data"); - return false; - } - // Assign indices for sections. if (InitSections[I].SectionName.size() && !SectionIndexMap[InitSections[I].SectionName]) { @@ -135,23 +157,58 @@ bool XCOFFWriter::initSectionHeader(uint64_t &CurrentOffset) { } } - // Calculate the physical/virtual address. This field should contain 0 for - // all sections except the text, data and bss sections. - if (InitSections[I].Flags != XCOFF::STYP_TEXT && - InitSections[I].Flags != XCOFF::STYP_DATA && - InitSections[I].Flags != XCOFF::STYP_BSS) - InitSections[I].Address = 0; - else - InitSections[I].Address = CurrentSecAddr; + if (!InitSections[I].Size) + InitSections[I].Size = InitSections[I].SectionData.binary_size(); + + // Section data addresses (physical/virtual) are related to symbol + // addresses and alignments. Furthermore, it is possible to specify the + // same starting addresses for the .text, .data, and .tdata sections. + // Without examining all the symbols and their addresses and alignments, + // it is not possible to compute valid section addresses. The only + // condition required by XCOFF is that the .bss section immediately + // follows the .data section, and the .tbss section immediately follows + // the .tdata section. Therefore, we only assign addresses to the .bss + // and .tbss sections if they do not already have non-zero addresses. + // (If the YAML file is being used to generate a valid object file, we + // expect all section addresses to be specified explicitly.) + switch (InitSections[I].Flags) { + case XCOFF::STYP_DATA: + CurrentEndDataAddr = InitSections[I].Address + InitSections[I].Size; + break; + case XCOFF::STYP_BSS: + if (!InitSections[I].Address) + InitSections[I].Address = CurrentEndDataAddr; + break; + case XCOFF::STYP_TDATA: + CurrentEndTDataAddr = InitSections[I].Address + InitSections[I].Size; + break; + case XCOFF::STYP_TBSS: + if (!InitSections[I].Address) + InitSections[I].Address = CurrentEndTDataAddr; + break; + } - // Calculate the FileOffsetToData and data size for sections. if (InitSections[I].SectionData.binary_size()) { - InitSections[I].FileOffsetToData = CurrentOffset; + if (InitSections[I].FileOffsetToData) { + // Use the provided FileOffsetToData.
+ if (CurrentOffset > InitSections[I].FileOffsetToData) { + reportOverwrite(CurrentOffset, InitSections[I].FileOffsetToData, + "FileOffsetToData for the " + + InitSections[I].SectionName + " section"); + return false; + } + CurrentOffset = InitSections[I].FileOffsetToData; + } else { + CurrentOffset = alignTo(CurrentOffset, DefaultSectionAlign); + InitSections[I].FileOffsetToData = CurrentOffset; + } CurrentOffset += InitSections[I].SectionData.binary_size(); - // Ensure the offset is aligned to DefaultSectionAlign. - CurrentOffset = alignTo(CurrentOffset, DefaultSectionAlign); - InitSections[I].Size = CurrentOffset - InitSections[I].FileOffsetToData; - CurrentSecAddr += InitSections[I].Size; + if (CurrentOffset > MaxRawDataSize) { + ErrHandler("maximum object size (" + Twine(MaxRawDataSize) + + ") exceeded when writing data for section " + Twine(I + 1) + + " (" + Twine(InitSections[I].SectionName) + ")"); + return false; + } } } return initRelocations(CurrentOffset); @@ -255,12 +312,20 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) { // Calculate SymbolTableOffset for the file header. if (InitFileHdr.NumberOfSymTableEntries) { + if (Obj.Header.SymbolTableOffset) { + if (CurrentOffset > Obj.Header.SymbolTableOffset) { + reportOverwrite(CurrentOffset, Obj.Header.SymbolTableOffset, + "SymbolTableOffset"); + return false; + } + CurrentOffset = Obj.Header.SymbolTableOffset; + } InitFileHdr.SymbolTableOffset = CurrentOffset; CurrentOffset += InitFileHdr.NumberOfSymTableEntries * XCOFF::SymbolTableEntrySize; if (CurrentOffset > MaxRawDataSize) { - ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + - "exceeded when writing symbols"); + ErrHandler("maximum object size of " + Twine(MaxRawDataSize) + + " exceeded when writing symbols"); return false; } } @@ -269,7 +334,8 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) { } void XCOFFWriter::initAuxFileHeader() { - InitAuxFileHdr = *Obj.AuxHeader; + if (Obj.AuxHeader) + InitAuxFileHdr = *Obj.AuxHeader; // In general, an object file might contain multiple sections of a given type, // but in a loadable module, there must be exactly one .text, .data, .bss, and // .loader section. A loadable object might also have one .tdata section and @@ -323,28 +389,32 @@ void XCOFFWriter::initAuxFileHeader() { bool XCOFFWriter::assignAddressesAndIndices() { uint64_t FileHdrSize = Is64Bit ? XCOFF::FileHeaderSize64 : XCOFF::FileHeaderSize32; + + // If AuxHeaderSize is specified in the YAML file, we construct + // an auxiliary header. uint64_t AuxFileHdrSize = 0; - if (Obj.AuxHeader) - AuxFileHdrSize = Obj.Header.AuxHeaderSize - ? Obj.Header.AuxHeaderSize - : (Is64Bit ? XCOFF::AuxFileHeaderSize64 - : XCOFF::AuxFileHeaderSize32); + + if (Obj.Header.AuxHeaderSize) + AuxFileHdrSize = Obj.Header.AuxHeaderSize; + else if (Obj.AuxHeader) + AuxFileHdrSize = + (Is64Bit ? XCOFF::AuxFileHeaderSize64 : XCOFF::AuxFileHeaderSize32); uint64_t SecHdrSize = Is64Bit ? XCOFF::SectionHeaderSize64 : XCOFF::SectionHeaderSize32; uint64_t CurrentOffset = FileHdrSize + AuxFileHdrSize + InitSections.size() * SecHdrSize; // Calculate section header info. - if (!initSectionHeader(CurrentOffset)) + if (!initSectionHeaders(CurrentOffset)) return false; - InitFileHdr.AuxHeaderSize = AuxFileHdrSize; // Calculate file header info. if (!initFileHeader(CurrentOffset)) return false; + InitFileHdr.AuxHeaderSize = AuxFileHdrSize; // Initialize the auxiliary file header. - if (Obj.AuxHeader) + if (AuxFileHdrSize) initAuxFileHeader(); // Initialize the string table. 
@@ -357,18 +427,14 @@ void XCOFFWriter::writeFileHeader() { : InitFileHdr.NumberOfSections); W.write(Obj.Header.TimeStamp); if (Is64Bit) { - W.write(Obj.Header.SymbolTableOffset - ? Obj.Header.SymbolTableOffset - : InitFileHdr.SymbolTableOffset); + W.write(InitFileHdr.SymbolTableOffset); W.write(InitFileHdr.AuxHeaderSize); W.write(Obj.Header.Flags); W.write(Obj.Header.NumberOfSymTableEntries ? Obj.Header.NumberOfSymTableEntries : InitFileHdr.NumberOfSymTableEntries); } else { - W.write(Obj.Header.SymbolTableOffset - ? Obj.Header.SymbolTableOffset - : InitFileHdr.SymbolTableOffset); + W.write(InitFileHdr.SymbolTableOffset); W.write(Obj.Header.NumberOfSymTableEntries ? Obj.Header.NumberOfSymTableEntries : InitFileHdr.NumberOfSymTableEntries); @@ -392,6 +458,9 @@ void XCOFFWriter::writeAuxFileHeader() { W.write(InitAuxFileHdr.EntryPointAddr.value_or(yaml::Hex64(0))); W.write(InitAuxFileHdr.TextStartAddr.value_or(yaml::Hex64(0))); W.write(InitAuxFileHdr.DataStartAddr.value_or(yaml::Hex64(0))); + // A short 32-bit auxiliary header ends here. + if (InitFileHdr.AuxHeaderSize == XCOFF::AuxFileHeaderSizeShort) + return; W.write(InitAuxFileHdr.TOCAnchorAddr.value_or(yaml::Hex64(0))); } W.write(InitAuxFileHdr.SecNumOfEntryPoint.value_or(0)); @@ -434,50 +503,39 @@ void XCOFFWriter::writeAuxFileHeader() { InitAuxFileHdr.Flag.value_or(yaml::Hex16(XCOFF::SHR_SYMTAB))); if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize64) W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize64); - } else if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32) { - W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize32); + } else { + if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32) + W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize32); } } -void XCOFFWriter::writeSectionHeader() { +void XCOFFWriter::writeSectionHeaders() { for (uint16_t I = 0, E = Obj.Sections.size(); I < E; ++I) { - XCOFFYAML::Section YamlSec = Obj.Sections[I]; XCOFFYAML::Section DerivedSec = InitSections[I]; - writeName(YamlSec.SectionName, W); - // Virtual address is the same as physical address. - uint64_t SectionAddress = - YamlSec.Address ? YamlSec.Address : DerivedSec.Address; + writeName(DerivedSec.SectionName, W); if (Is64Bit) { - W.write(SectionAddress); // Physical address - W.write(SectionAddress); // Virtual address - W.write(YamlSec.Size ? YamlSec.Size : DerivedSec.Size); - W.write(YamlSec.FileOffsetToData ? YamlSec.FileOffsetToData - : DerivedSec.FileOffsetToData); - W.write(YamlSec.FileOffsetToRelocations - ? YamlSec.FileOffsetToRelocations - : DerivedSec.FileOffsetToRelocations); - W.write(YamlSec.FileOffsetToLineNumbers); - W.write(YamlSec.NumberOfRelocations - ? YamlSec.NumberOfRelocations - : DerivedSec.NumberOfRelocations); - W.write(YamlSec.NumberOfLineNumbers); - W.write(YamlSec.Flags); + // Virtual address is the same as physical address. + W.write(DerivedSec.Address); // Physical address + W.write(DerivedSec.Address); // Virtual address + W.write(DerivedSec.Size); + W.write(DerivedSec.FileOffsetToData); + W.write(DerivedSec.FileOffsetToRelocations); + W.write(DerivedSec.FileOffsetToLineNumbers); + W.write(DerivedSec.NumberOfRelocations); + W.write(DerivedSec.NumberOfLineNumbers); + W.write(DerivedSec.Flags); W.OS.write_zeros(4); } else { - W.write(SectionAddress); // Physical address - W.write(SectionAddress); // Virtual address - W.write(YamlSec.Size ? YamlSec.Size : DerivedSec.Size); - W.write(YamlSec.FileOffsetToData ? 
YamlSec.FileOffsetToData - : DerivedSec.FileOffsetToData); - W.write(YamlSec.FileOffsetToRelocations - ? YamlSec.FileOffsetToRelocations - : DerivedSec.FileOffsetToRelocations); - W.write(YamlSec.FileOffsetToLineNumbers); - W.write(YamlSec.NumberOfRelocations - ? YamlSec.NumberOfRelocations - : DerivedSec.NumberOfRelocations); - W.write(YamlSec.NumberOfLineNumbers); - W.write(YamlSec.Flags); + // Virtual address is the same as physical address. + W.write(DerivedSec.Address); // Physical address + W.write(DerivedSec.Address); // Virtual address + W.write(DerivedSec.Size); + W.write(DerivedSec.FileOffsetToData); + W.write(DerivedSec.FileOffsetToRelocations); + W.write(DerivedSec.FileOffsetToLineNumbers); + W.write(DerivedSec.NumberOfRelocations); + W.write(DerivedSec.NumberOfLineNumbers); + W.write(DerivedSec.Flags); } } } @@ -487,8 +545,8 @@ bool XCOFFWriter::writeSectionData() { XCOFFYAML::Section YamlSec = Obj.Sections[I]; if (YamlSec.SectionData.binary_size()) { // Fill the padding size with zeros. - int64_t PaddingSize = - InitSections[I].FileOffsetToData - (W.OS.tell() - StartOffset); + int64_t PaddingSize = (uint64_t)InitSections[I].FileOffsetToData - + (W.OS.tell() - StartOffset); if (PaddingSize < 0) { ErrHandler("redundant data was written before section data"); return false; @@ -685,7 +743,7 @@ bool XCOFFWriter::writeAuxSymbol( bool XCOFFWriter::writeSymbols() { int64_t PaddingSize = - (uint64_t)InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset); + InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset); if (PaddingSize < 0) { ErrHandler("redundant data was written before symbols"); return false; @@ -797,10 +855,10 @@ bool XCOFFWriter::writeXCOFF() { return false; StartOffset = W.OS.tell(); writeFileHeader(); - if (Obj.AuxHeader) + if (InitFileHdr.AuxHeaderSize) writeAuxFileHeader(); if (!Obj.Sections.empty()) { - writeSectionHeader(); + writeSectionHeaders(); if (!writeSectionData()) return false; if (!writeRelocations()) diff --git a/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test b/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test index 1df6340..96dcd72 100644 --- a/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test +++ b/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test @@ -5,7 +5,7 @@ # RUN: yaml2obj %s --docnum=1 -o %t1 # RUN: not llvm-objcopy %t1 %t1.out 2>&1 | FileCheck %s -DFILE=%t1 --check-prefix=ERROR1 -# ERROR1: error: '[[FILE]]': The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x4 goes past the end of the file +# ERROR1: error: '[[FILE]]': The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x20 goes past the end of the file --- !XCOFF FileHeader: @@ -13,6 +13,7 @@ FileHeader: Sections: - SectionData: '00007400' FileOffsetToData: 0x70 + Size: 0x20 ## Failed to read relocations. 
# RUN: yaml2obj %s --docnum=2 -o %t2 @@ -35,12 +36,13 @@ Sections: # RUN: yaml2obj %s --docnum=3 -o %t3 # RUN: not llvm-objcopy %t3 %t3.out 2>&1 | FileCheck %s -DFILE=%t3 --check-prefix=ERROR3 -# ERROR3: error: '[[FILE]]': The end of the file was unexpectedly encountered: symbol table with offset 0x15 and size 0x24 goes past the end of the file +# ERROR3: error: '[[FILE]]': The end of the file was unexpectedly encountered: symbol table with offset 0x15 and size 0x36 goes past the end of the file --- !XCOFF FileHeader: MagicNumber: 0x01DF OffsetToSymbolTable: 0x15 + EntriesInSymbolTable: 3 Symbols: - Name: foo AuxEntries: diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test index 91354f5..96cac6b 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test @@ -112,4 +112,4 @@ Symbols: # CHECK-NEXT: 70: 00 00 00 00 # CHECK-NEXT: ... # CHECK-NEXT: 7c: 00 12 34 00 -# CHECK-NEXT: 80: 00 00 00 00 +# CHECK-NEXT: 80: 00 00 00 diff --git a/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test b/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test index e80d5f6..1a110fb 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test @@ -10,7 +10,7 @@ # CHECK-NEXT: 1 .data 00000004 00000000 DATA # CHECK-NEXT: 2 .bss 00000000 00000010 BSS # CHECK-NEXT: 3 .tdata 00000004 00000000 DATA -# CHECK-NEXT: 4 .tbss 00000000 00000000 BSS +# CHECK-NEXT: 4 .tbss 00000000 00000004 BSS # CHECK-NEXT: 5 .dwline 00000046 00000000 DEBUG # CHECK-NEXT: 6 .debug 00000046 00000000 DEBUG diff --git a/llvm/test/tools/llvm-readobj/XCOFF/file-header.test b/llvm/test/tools/llvm-readobj/XCOFF/file-header.test index 8cbd847..2407aef 100644 --- a/llvm/test/tools/llvm-readobj/XCOFF/file-header.test +++ b/llvm/test/tools/llvm-readobj/XCOFF/file-header.test @@ -23,7 +23,6 @@ FileHeader: CreationTime: [[CREATTIME=1]] EntriesInSymbolTable: [[SYMBOLCOUNT=1]] NumberOfSections: 1 - OffsetToSymbolTable: 0x3C AuxiliaryHeaderSize: 0 Flags: 0x12 Sections: @@ -42,7 +41,7 @@ Symbols: # FILEHEADER64-NEXT: Magic: 0x1F7 # FILEHEADER64-NEXT: NumberOfSections: 1 # FILEHEADER64-NEXT: TimeStamp: None (0x0) -# FILEHEADER64-NEXT: SymbolTableOffset: 0x3C +# FILEHEADER64-NEXT: SymbolTableOffset: 0x60 # FILEHEADER64-NEXT: SymbolTableEntries: 1 # FILEHEADER64-NEXT: OptionalHeaderSize: 0x0 # FILEHEADER64-NEXT: Flags: 0x12 diff --git a/llvm/test/tools/llvm-readobj/XCOFF/sections.test b/llvm/test/tools/llvm-readobj/XCOFF/sections.test index be09893..36e85d6 100644 --- a/llvm/test/tools/llvm-readobj/XCOFF/sections.test +++ b/llvm/test/tools/llvm-readobj/XCOFF/sections.test @@ -13,7 +13,7 @@ # SEC32-NEXT: Name: .text # SEC32-NEXT: PhysicalAddress: 0x0 # SEC32-NEXT: VirtualAddress: 0x0 -# SEC32-NEXT: Size: 0x4 +# SEC32-NEXT: Size: 0x2 # SEC32-NEXT: RawDataOffset: 0x64 # SEC32-NEXT: RelocationPointer: 0x0 # SEC32-NEXT: LineNumberPointer: 0x0 @@ -24,11 +24,11 @@ # SEC32-NEXT: Section { # SEC32-NEXT: Index: 2 # SEC32-NEXT: Name: .data -# SEC32-NEXT: PhysicalAddress: 0x4 -# SEC32-NEXT: VirtualAddress: 0x4 -# SEC32-NEXT: Size: 0x4 +# SEC32-NEXT: PhysicalAddress: 0x0 +# SEC32-NEXT: VirtualAddress: 0x0 +# SEC32-NEXT: Size: 0x2 # SEC32-NEXT: RawDataOffset: 0x68 -# SEC32-NEXT: RelocationPointer: 0x6C +# SEC32-NEXT: RelocationPointer: 0x6A # SEC32-NEXT: LineNumberPointer: 0x0 # SEC32-NEXT: NumberOfRelocations: 1 # 
SEC32-NEXT: NumberOfLineNumbers: 0 @@ -65,7 +65,7 @@ Sections: # SEC64-NEXT: Name: .text # SEC64-NEXT: PhysicalAddress: 0x0 # SEC64-NEXT: VirtualAddress: 0x0 -# SEC64-NEXT: Size: 0x4 +# SEC64-NEXT: Size: 0x2 # SEC64-NEXT: RawDataOffset: 0xA8 # SEC64-NEXT: RelocationPointer: 0x0 # SEC64-NEXT: LineNumberPointer: 0x0 @@ -76,11 +76,11 @@ Sections: # SEC64-NEXT: Section { # SEC64-NEXT: Index: 2 # SEC64-NEXT: Name: .data -# SEC64-NEXT: PhysicalAddress: 0x4 -# SEC64-NEXT: VirtualAddress: 0x4 -# SEC64-NEXT: Size: 0x4 +# SEC64-NEXT: PhysicalAddress: 0x0 +# SEC64-NEXT: VirtualAddress: 0x0 +# SEC64-NEXT: Size: 0x2 # SEC64-NEXT: RawDataOffset: 0xAC -# SEC64-NEXT: RelocationPointer: 0xB0 +# SEC64-NEXT: RelocationPointer: 0xAE # SEC64-NEXT: LineNumberPointer: 0x0 # SEC64-NEXT: NumberOfRelocations: 1 # SEC64-NEXT: NumberOfLineNumbers: 0 diff --git a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml index 9f2f68b..12f44d0 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml @@ -31,7 +31,7 @@ # CHECK32-NEXT: Info: 0xF # CHECK32-NEXT: Type: 0x3 # CHECK32-NEXT: - Name: .data -# CHECK32-NEXT: Address: 0x4 +# CHECK32-NEXT: Address: 0x0 # CHECK32-NEXT: Size: 0x4 # CHECK32-NEXT: FileOffsetToData: 0x68 # CHECK32-NEXT: FileOffsetToRelocations: 0x76 @@ -105,7 +105,7 @@ # CHECK64-NEXT: Info: 0xF # CHECK64-NEXT: Type: 0x3 # CHECK64-NEXT: - Name: .data -# CHECK64-NEXT: Address: 0x4 +# CHECK64-NEXT: Address: 0x0 # CHECK64-NEXT: Size: 0x4 # CHECK64-NEXT: FileOffsetToData: 0xAC # CHECK64-NEXT: FileOffsetToRelocations: 0xBE diff --git a/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml b/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml index 1e16c5f..0e16a47 100644 --- a/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml +++ b/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml @@ -5,7 +5,7 @@ # RUN: yaml2obj %s --docnum=1 -o %t1 # RUN: not obj2yaml %t1 2>&1 | FileCheck %s -DFILE=%t1 --check-prefix=ERROR1 -# ERROR1: The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x4 goes past the end of the file +# ERROR1: The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x20 goes past the end of the file --- !XCOFF FileHeader: @@ -13,6 +13,7 @@ FileHeader: Sections: - SectionData: '00007400' FileOffsetToData: 0x70 + Size: 0x20 ## Error2: failed to get relocations. 
# RUN: yaml2obj %s --docnum=2 -o %t2 diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml index f6d6193..a93123b 100644 --- a/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml +++ b/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml @@ -10,12 +10,12 @@ # CASE1: AuxiliaryHeader { # CASE1-NEXT: Magic: 0x10B # CASE1-NEXT: Version: 0x1 -# CASE1-NEXT: Size of .text section: 0x8 -# CASE1-NEXT: Size of .data section: 0x8 -# CASE1-NEXT: Size of .bss section: 0x8 +# CASE1-NEXT: Size of .text section: 0x5 +# CASE1-NEXT: Size of .data section: 0x5 +# CASE1-NEXT: Size of .bss section: 0x5 # CASE1-NEXT: Entry point address: 0x0 -# CASE1-NEXT: .text section start address: 0x4 -# CASE1-NEXT: .data section start address: 0x10 +# CASE1-NEXT: .text section start address: 0x0 +# CASE1-NEXT: .data section start address: 0x0 # CASE1-NEXT: TOC anchor address: 0x0 # CASE1-NEXT: Section number of entryPoint: 0 # CASE1-NEXT: Section number of .text: 2 @@ -79,16 +79,12 @@ Sections: # RUN: yaml2obj %s --docnum=1 -DMAGIC=0x1F7 -o %t2 # RUN: llvm-readobj --auxiliary-header %t2 | FileCheck %s --check-prefix=CASE2 -## Case2: same as case1, except it is 64-bit. -# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x1F7 -o %t2 -# RUN: llvm-readobj --auxiliary-header %t2 | FileCheck %s --check-prefix=CASE2 - # CASE2: AuxiliaryHeader { # CASE2-NEXT: Magic: 0x10B # CASE2-NEXT: Version: 0x1 # CASE2-NEXT: Reserved for debugger: 0x0 -# CASE2-NEXT: .text section start address: 0x2 -# CASE2-NEXT: .data section start address: 0xE +# CASE2-NEXT: .text section start address: 0x0 +# CASE2-NEXT: .data section start address: 0x0 # CASE2-NEXT: TOC anchor address: 0x0 # CASE2-NEXT: Section number of entryPoint: 0 # CASE2-NEXT: Section number of .text: 2 @@ -106,9 +102,9 @@ Sections: # CASE2-NEXT: Stack page size: 0x0 # CASE2-NEXT: Flag: 0x80 # CASE2-NEXT: Alignment of thread-local storage: 0x0 -# CASE2-NEXT: Size of .text section: 0x8 -# CASE2-NEXT: Size of .data section: 0x8 -# CASE2-NEXT: Size of .bss section: 0x8 +# CASE2-NEXT: Size of .text section: 0x5 +# CASE2-NEXT: Size of .data section: 0x5 +# CASE2-NEXT: Size of .bss section: 0x5 # CASE2-NEXT: Entry point address: 0x0 # CASE2-NEXT: Maximum stack size: 0x0 # CASE2-NEXT: Maximum data size: 0x0 diff --git a/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml b/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml index ed85bc6..8c3d77d 100644 --- a/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml +++ b/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml @@ -65,8 +65,8 @@ Symbols: # CHECK-NEXT: Section { # CHECK-NEXT: Index: 2 # CHECK-NEXT: Name: .data -# CHECK-NEXT: PhysicalAddress: 0x8 -# CHECK-NEXT: VirtualAddress: 0x8 +# CHECK-NEXT: PhysicalAddress: 0x0 +# CHECK-NEXT: VirtualAddress: 0x0 # CHECK-NEXT: Size: 0x8 # CHECK-NEXT: RawDataOffset: 0xE4 # CHECK-NEXT: RelocationPointer: 0xF0 diff --git a/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml b/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml new file mode 100644 index 0000000..ee23a16 --- /dev/null +++ b/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml @@ -0,0 +1,91 @@ +## Report errors when specified file offsets are invalid. 
+ +# RUN: not yaml2obj %s -DTEXTRAWDATAOFFSET=0xFFFFFFF0 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR1 +# ERROR1: current file offset (4294967288) is bigger than the specified FileOffsetToData for the .data section (108) + +# RUN: not yaml2obj %s -DDATARAWDATAOFFSET=0xFFFFFFF0 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR2 +# ERROR2: current file offset (4294967284) is bigger than the specified FileOffsetToRelocations for the .text section (112) + +# RUN: not yaml2obj %s -DRELOCOFFSET=0xFFFFFFF0 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR3 +# ERROR3: current file offset (4294967290) is bigger than the specified SymbolTableOffset (122) + +# RUN: not yaml2obj %s -DSYMTABOFFSET=0x100000000 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR4 +# ERROR4: maximum object size of 4294967295 exceeded when writing symbols + +# RUN: not yaml2obj %s -DRELOCOFFSET=0x100000000 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR5 +# ERROR5: maximum object size (4294967295) exceeded when writing relocation data for section .text + +# RUN: not yaml2obj %s -DDATARAWDATAOFFSET=0x100000000 -o %t 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR6 +# ERROR6: maximum object size (4294967295) exceeded when writing data for section 2 (.data) + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF + NumberOfSections: 2 + OffsetToSymbolTable: [[SYMTABOFFSET=0x7A]] + EntriesInSymbolTable: 6 + AuxiliaryHeaderSize: 0 + Flags: 0x0 +Sections: + - Name: .text + Address: 0x0 + Size: 0x8 + FileOffsetToData: [[TEXTRAWDATAOFFSET=0x64]] + FileOffsetToRelocations: [[RELOCOFFSET=0x70]] + NumberOfRelocations: 0x1 + Flags: [ STYP_TEXT ] + SectionData: 386000004BFFFFFC + Relocations: + - Address: 0x4 + Symbol: 0x2 + Info: 0x99 + Type: 0x1A + - Name: .data + Address: 0x0 + Size: 0x4 + FileOffsetToData: [[DATARAWDATAOFFSET=0x6C]] + FileOffsetToRelocations: 0 + Flags: [ STYP_DATA ] + SectionData: 3210ABCD +Symbols: + - Name: .bar + Value: 0x0 + Section: N_UNDEF + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0 + StorageMappingClass: XMC_PR + SectionOrLength: 0 + - Name: '' + Value: 0x0 + Section: .text + Type: 0x0 + StorageClass: C_HIDEXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 17 + StorageMappingClass: XMC_PR + SectionOrLength: 8 + - Name: .foo + Value: 0x0 + Section: .text + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 2 + StorageMappingClass: XMC_PR + SectionOrLength: 2 +StringTable: {} +... -- cgit v1.1 From b1b8a383fcdab007ccd1a5daa08cb33ce7cbc6c0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 9 Feb 2024 15:25:24 +0100 Subject: [InstCombine] Remove one-use restriction on icmp of gep fold (#76730) The fold for icmp (gep (p, i1), gep (p, i2)) to icmp (i1, i2) is currently limited to one of the GEPs either having one use or a constant offset. I believe this is to avoid duplicating complex arithmetic both in the GEP and the offset comparison. This patch instead does the same thing that the indexed compare fold does, which is to rewrite the GEP into i8 form if necessary, so that the offset arithmetic is not repeated after the transform. I ran into this problem in a case where there are multiple conditions on the same pointer, which prevents them from getting folded. 
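A C-level analogue of the motivating case may help (an illustrative sketch, not code from the patch; use() and precedes() are invented names standing in for whatever keeps the pointers alive):

    extern void use(void *);

    // Both pointers below have extra uses, so the old one-use restriction
    // blocked the fold. With this patch the GEPs are rewritten through byte
    // offsets, and the pointer compare becomes an integer compare of those
    // offsets, without duplicating the index arithmetic.
    bool precedes(int *foo, long i, long j) {
      int *gep1 = foo + i;            // byte offset: i << 2
      short *gep2 = (short *)foo + j; // byte offset: j << 1
      use(gep1);
      use(gep2);
      return (void *)gep1 < (void *)gep2; // folds to (i << 2) < (j << 1)
    }

This is the shape of the updated CHECK lines in test60_extra_use below: the inbounds GEPs are re-emitted as getelementptr inbounds i8 over shl nsw offsets, and the icmp ult ptr becomes an icmp slt i64 on those offsets.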
--- .../Transforms/InstCombine/InstCombineCompares.cpp | 30 +++++++++++++++++----- llvm/test/Transforms/InstCombine/icmp-custom-dl.ll | 4 +-- llvm/test/Transforms/InstCombine/icmp-gep.ll | 17 +++++++----- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index cbb6988..280c4d7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -813,14 +813,30 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } } - // Only lower this if the icmp is the only user of the GEP or if we expect - // the result to fold to a constant! - if ((GEPsInBounds || CmpInst::isEquality(Cond)) && - (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && - (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse())) { + if (GEPsInBounds || CmpInst::isEquality(Cond)) { + auto EmitGEPOffsetAndRewrite = [&](GEPOperator *GEP) { + IRBuilderBase::InsertPointGuard Guard(Builder); + auto *Inst = dyn_cast(GEP); + if (Inst) + Builder.SetInsertPoint(Inst); + + Value *Offset = EmitGEPOffset(GEP); + // If a non-trivial GEP has other uses, rewrite it to avoid duplicating + // the offset arithmetic. + if (Inst && !GEP->hasOneUse() && !GEP->hasAllConstantIndices() && + !GEP->getSourceElementType()->isIntegerTy(8)) { + replaceInstUsesWith(*Inst, + Builder.CreateGEP(Builder.getInt8Ty(), + GEP->getPointerOperand(), + Offset, "", GEPsInBounds)); + eraseInstFromFunction(*Inst); + } + return Offset; + }; + // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) - Value *L = EmitGEPOffset(GEPLHS); - Value *R = EmitGEPOffset(GEPRHS); + Value *L = EmitGEPOffsetAndRewrite(GEPLHS); + Value *R = EmitGEPOffsetAndRewrite(GEPRHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R); } } diff --git a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll index 491f214..a595ddb 100644 --- a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll +++ b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll @@ -40,8 +40,8 @@ define i1 @test59_as1(ptr addrspace(1) %foo) { define i1 @test60(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i32 ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[GEP1_IDX]], [[TMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -54,8 +54,8 @@ define i1 @test60(ptr %foo, i64 %i, i64 %j) { define i1 @test60_as1(ptr addrspace(1) %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_as1( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[TMP1]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index d912f96..a0e03a5 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -313,8 +313,8 @@ define i1 @test_gep_eq_no_inbounds(ptr %foo, i64 %i, i64 %j) { define i1 @test60_as1(ptr addrspace(1) %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_as1( ; CHECK-NEXT: [[TMP1:%.*]] = 
trunc i64 [[I:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[TMP1]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -400,11 +400,13 @@ define i1 @test61_as1(ptr addrspace(1) %foo, i16 %i, i16 %j) { define i1 @test60_extra_use(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_extra_use( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[I:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[FOO]], i64 [[J:%.*]] +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_IDX]] +; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nsw i64 [[J:%.*]], 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[GEP2_IDX]] ; CHECK-NEXT: call void @use(ptr [[GEP1]]) ; CHECK-NEXT: call void @use(ptr [[GEP2]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[GEP2_IDX]] ; CHECK-NEXT: ret i1 [[CMP]] ; %gep1 = getelementptr inbounds i32, ptr %foo, i64 %i @@ -446,13 +448,14 @@ define i1 @test60_extra_use_const_operands_no_inbounds(ptr %foo, i64 %i, i64 %j) define void @test60_extra_use_fold(ptr %foo, i64 %start.idx, i64 %end.offset) { ; CHECK-LABEL: @test60_extra_use_fold( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[START_IDX:%.*]] +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[START_IDX:%.*]], 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_IDX]] ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[END_OFFSET:%.*]] ; CHECK-NEXT: call void @use(ptr [[GEP1]]) ; CHECK-NEXT: call void @use(ptr [[GEP2]]) -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[GEP1_IDX]], [[END_OFFSET]] ; CHECK-NEXT: call void @use.i1(i1 [[CMP1]]) -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[GEP1_IDX]], [[END_OFFSET]] ; CHECK-NEXT: call void @use.i1(i1 [[CMP2]]) ; CHECK-NEXT: ret void ; -- cgit v1.1 From fcb59203c8b883aa39d22cf9788c48dbbb734932 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 9 Feb 2024 10:05:26 -0500 Subject: [AMDGPU][DOC] Add MI200 Names to AMDGPUUsage Doc (#81252) --- llvm/docs/AMDGPUUsage.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6b24171..f463e83 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -357,12 +357,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. - ``gfx90a`` ``amdgcn`` dGPU - sramecc - Absolute - *rocm-amdhsa* *TBA* - - tgsplit flat - - xnack scratch .. TODO:: + ``gfx90a`` ``amdgcn`` dGPU - sramecc - Absolute - *rocm-amdhsa* - AMD Instinct MI210 Accelerator + - tgsplit flat - *rocm-amdhsa* - AMD Instinct MI250 Accelerator + - xnack scratch - *rocm-amdhsa* - AMD Instinct MI250X Accelerator - kernarg preload - Packed - work-item Add product - IDs names. 
+ work-item + IDs ``gfx90c`` ``amdgcn`` APU - xnack - Absolute - *pal-amdpal* - Ryzen 7 4700G flat - Ryzen 7 4700GE -- cgit v1.1 From 50c5107f42a88a1d2ab66dc6cd1f2cfee6707f7d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 9 Feb 2024 19:20:10 +0400 Subject: [clang] Add tests for DRs about inheriting constructors (#79981) Covers CWG issues [1150](https://cplusplus.github.io/CWG/issues/1150.html), [1487](https://cplusplus.github.io/CWG/issues/1487.html), [1567](https://cplusplus.github.io/CWG/issues/1567.html), [1738](https://cplusplus.github.io/CWG/issues/1738.html), [2273](https://cplusplus.github.io/CWG/issues/2273.html), [2277](https://cplusplus.github.io/CWG/issues/2277.html), [2356](https://cplusplus.github.io/CWG/issues/2356.html), [2504](https://cplusplus.github.io/CWG/issues/2504.html). On top of the wording in proposed resolutions, [P0136R1](https://wg21.link/p0136r1) "Rewording inheriting constructors (core issue 1941 et al)" is a very relevant paper. Note that status for 1738 `sup P0136R1` is not officially recognized by CWG, but saying `yes` or `no` seems even more confusing to me. Official resolution is to reject certain code, but Clang is the only implementation that still rejects it to this day: https://godbolt.org/z/b1W8jc1o5. GCC rejected it until 9, now it's accepted: https://godbolt.org/z/of6oh4sdT --- clang/test/CXX/drs/dr11xx.cpp | 2 ++ clang/test/CXX/drs/dr14xx.cpp | 24 ++++++++++++++++++++++++ clang/test/CXX/drs/dr15xx.cpp | 39 +++++++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/dr17xx.cpp | 17 +++++++++++++++++ clang/test/CXX/drs/dr22xx.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/dr23xx.cpp | 25 +++++++++++++++++++++++++ clang/test/CXX/drs/dr2504.cpp | 37 +++++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/dr25xx.cpp | 2 ++ clang/www/cxx_dr_status.html | 16 ++++++++-------- 9 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 clang/test/CXX/drs/dr2504.cpp diff --git a/clang/test/CXX/drs/dr11xx.cpp b/clang/test/CXX/drs/dr11xx.cpp index 86e726a..a71a105 100644 --- a/clang/test/CXX/drs/dr11xx.cpp +++ b/clang/test/CXX/drs/dr11xx.cpp @@ -70,3 +70,5 @@ namespace dr1113 { // dr1113: partial } void g() { f(); } } + +// dr1150: na diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/dr14xx.cpp index d262f6f..58a2b3a 100644 --- a/clang/test/CXX/drs/dr14xx.cpp +++ b/clang/test/CXX/drs/dr14xx.cpp @@ -614,6 +614,30 @@ enum E2 : S::I { e }; #endif } // namespace dr1482 +namespace dr1487 { // dr1487: 3.3 +#if __cplusplus >= 201103L +struct A { // #dr1482-A + struct B { + using A::A; + // since-cxx11-error@-1 {{using declaration refers into 'A::', which is not a base class of 'B'}} + }; + + struct C : A { + // since-cxx11-error@-1 {{base class has incomplete type}} + // since-cxx11-note@#dr1482-A {{definition of 'dr1487::A' is not complete until the closing '}'}} + using A::A; + // since-cxx11-error@-1 {{using declaration refers into 'A::', which is not a base class of 'C'}} + }; + + struct D; +}; + +struct D : A { + using A::A; +}; +#endif +} // namespace dr1487 + namespace dr1490 { // dr1490: 3.7 c++11 #if __cplusplus >= 201103L // List-initialization from a string literal diff --git a/clang/test/CXX/drs/dr15xx.cpp b/clang/test/CXX/drs/dr15xx.cpp index 3d4050a..ac503db 100644 --- a/clang/test/CXX/drs/dr15xx.cpp +++ b/clang/test/CXX/drs/dr15xx.cpp @@ -360,6 +360,45 @@ namespace dr1563 { // dr1563: yes #endif } +namespace dr1567 { // dr1567: 3.3 +#if __cplusplus >= 201103L +struct B; +struct A { + A(const 
A&); + A(const B&) = delete; + A(A&&); + A(B&&) = delete; + A(int); // #dr1567-A-int +}; + +struct B: A { // #dr1567-B + using A::A; // #dr1567-using-A + B(double); // #dr1567-B-double +}; + +A a{0}; +B b{1.0}; +// Good, deleted converting ctors are not inherited as copy/move ctors +B b2{b}; +B b3{B{1.0}}; +// Good, copy/move ctors are not inherited +B b4{a}; +// since-cxx11-error@-1 {{no matching constructor for initialization of 'B'}} +// since-cxx11-note@#dr1567-A-int {{candidate inherited constructor not viable: no known conversion from 'A' to 'int' for 1st argument}} +// since-cxx11-note@#dr1567-using-A {{constructor from base class 'A' inherited here}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'A' to 'const B' for 1st argument}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}} +// since-cxx11-note@#dr1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}} +B b5{A{0}}; +// since-cxx11-error@-1 {{no matching constructor for initialization of 'B'}} +// since-cxx11-note@#dr1567-A-int {{candidate inherited constructor not viable: no known conversion from 'A' to 'int' for 1st argument}} +// since-cxx11-note@#dr1567-using-A {{constructor from base class 'A' inherited here}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'A' to 'const B' for 1st argument}} +// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}} +// since-cxx11-note@#dr1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}} +#endif +} + namespace dr1573 { // dr1573: 3.9 #if __cplusplus >= 201103L // ellipsis is inherited (p0136r1 supersedes this part). 
diff --git a/clang/test/CXX/drs/dr17xx.cpp b/clang/test/CXX/drs/dr17xx.cpp index 885ed00..2f7e62d 100644 --- a/clang/test/CXX/drs/dr17xx.cpp +++ b/clang/test/CXX/drs/dr17xx.cpp @@ -89,6 +89,23 @@ S s(q); // #dr1736-s #endif } +namespace dr1738 { // dr1738: sup P0136R1 +#if __cplusplus >= 201103L +struct A { + template + A(int, T) {} +}; + +struct B : A { + using A::A; +}; + +// FIXME: this is well-formed since P0136R1 +template B::B(int, double); +// since-cxx11-error@-1 {{explicit instantiation of 'B' does not refer to a function template, variable template, member function, member class, or static data member}} +#endif +} + // dr1748 is in dr1748.cpp namespace dr1753 { // dr1753: 11 diff --git a/clang/test/CXX/drs/dr22xx.cpp b/clang/test/CXX/drs/dr22xx.cpp index 1951824..3a13cb0 100644 --- a/clang/test/CXX/drs/dr22xx.cpp +++ b/clang/test/CXX/drs/dr22xx.cpp @@ -154,6 +154,47 @@ const D &d3(c); // FIXME ill-formed #endif } +namespace dr2273 { // dr2273: 3.3 +#if __cplusplus >= 201103L +struct A { + A(int = 0) = delete; // #dr2273-A +}; + +struct B : A { // #dr2273-B + using A::A; +}; + +B b; +// since-cxx11-error@-1 {{call to implicitly-deleted default constructor of 'B'}} +// since-cxx11-note@#dr2273-B {{default constructor of 'B' is implicitly deleted because base class 'A' has a deleted default constructor}} +// since-cxx11-note@#dr2273-A {{'A' has been explicitly marked deleted here}} +#endif +} + +namespace dr2277 { // dr2277: partial +#if __cplusplus >= 201103L +struct A { + A(int, int = 0); + void f(int, int = 0); // #dr2277-A-f +}; +struct B : A { + B(int); + using A::A; + + void f(int); // #dr2277-B-f + using A::f; +}; + +void g() { + B b{0}; + b.f(0); // FIXME: this is well-formed for the same reason as initialization of 'b' above + // since-cxx11-error@-1 {{call to member function 'f' is ambiguous}} + // since-cxx11-note@#dr2277-A-f {{candidate function}} + // since-cxx11-note@#dr2277-B-f {{candidate function}} +} +#endif +} + namespace dr2292 { // dr2292: 9 #if __cplusplus >= 201103L template using id = T; diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/dr23xx.cpp index 3f8c476..c046373 100644 --- a/clang/test/CXX/drs/dr23xx.cpp +++ b/clang/test/CXX/drs/dr23xx.cpp @@ -147,6 +147,31 @@ enum struct alignas(64) B {}; #endif } // namespace dr2354 +namespace dr2356 { // dr2356: 4 +#if __cplusplus >= 201103L +struct A { + A(); + A(A &&); // #1 + template A(T &&); // #2 +}; +struct B : A { + using A::A; + B(const B &); // #3 + B(B &&) = default; // #4, implicitly deleted + // since-cxx11-warning@-1 {{explicitly defaulted move constructor is implicitly deleted}} + // since-cxx11-note@#dr2356-X {{move constructor of 'B' is implicitly deleted because field 'x' has a deleted move constructor}} + // since-cxx11-note@#dr2356-X {{'X' has been explicitly marked deleted here}} + // since-cxx11-note@-4 {{replace 'default' with 'delete'}} + + struct X { X(X &&) = delete; } x; // #dr2356-X +}; +extern B b1; +B b2 = static_cast(b1); // calls #3: #1, #2, and #4 are not viable +struct C { operator B&&(); }; +B b3 = C(); // calls #3 +#endif +} + #if __cplusplus >= 201402L namespace dr2358 { // dr2358: 16 void f2() { diff --git a/clang/test/CXX/drs/dr2504.cpp b/clang/test/CXX/drs/dr2504.cpp new file mode 100644 index 0000000..686ea73 --- /dev/null +++ b/clang/test/CXX/drs/dr2504.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK +// RUN: 
%clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 + +namespace dr2504 { // dr2504: no +#if __cplusplus >= 201103L +struct V { V() = default; V(int); }; +struct Q { Q(); }; +struct A : virtual V, Q { + using V::V; + A() = delete; +}; +int bar() { return 42; } +struct B : A { + B() : A(bar()) {} // ok +}; +struct C : B {}; +void foo() { C c; } // bar is not invoked, because the V subobject is not initialized as part of B +#endif +} + +// FIXME: As specified in the comment above (which comes from an example in the Standard), +// we are not supposed to unconditionally call `bar()` and call a constructor +// inherited from `V`. + +// SINCE-CXX11-LABEL: define linkonce_odr void @dr2504::B::B() +// SINCE-CXX11-NOT: br +// SINCE-CXX11: call noundef i32 @dr2504::bar() +// SINCE-CXX11-NOT: br +// SINCE-CXX11: call void @dr2504::A::A(int) +// SINCE-CXX11-LABEL: } + +// CHECK: {{.*}} diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/dr25xx.cpp index 502f032..b1e5480 100644 --- a/clang/test/CXX/drs/dr25xx.cpp +++ b/clang/test/CXX/drs/dr25xx.cpp @@ -10,6 +10,8 @@ // expected-no-diagnostics #endif +// dr2504 is in dr2504.cpp + namespace dr2516 { // dr2516: 3.0 // NB: reusing 1482 test #if __cplusplus >= 201103L diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 3e13a4d..4ce5c43 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -6708,7 +6708,7 @@ and POD class 1150 NAD Inheriting constructors have not been implemented - Unknown + N/A 1151 @@ -8730,7 +8730,7 @@ and POD class 1487 CD3 When are inheriting constructors declared? 
- Unknown + Clang 3.3 1488 @@ -9210,7 +9210,7 @@ and POD class 1567 C++14 Inheriting constructors and copy/move constructors - Unknown + Clang 3.3 1568 @@ -10236,7 +10236,7 @@ and POD class 1738 C++14 Explicit instantiation/specialization of inheriting constructor templates - Unknown + Superseded by P0136R1 1739 @@ -13446,7 +13446,7 @@ and POD class 2273 CD5 Inheriting constructors vs implicit default constructor - Unknown + Clang 3.3 2274 @@ -13470,7 +13470,7 @@ and POD class 2277 CD5 Ambiguity inheriting constructors with default arguments - Unknown + Partial 2278 @@ -13944,7 +13944,7 @@ and POD class 2356 CD5 Base class copy and move constructors should not be inherited - Unknown + Clang 4 2357 @@ -14832,7 +14832,7 @@ and POD class 2504 DR Inheriting constructors from virtual base classes - Unknown + No 2505 -- cgit v1.1 From d05483288465a87e75cfab51792801cfee43914c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 15:39:36 +0100 Subject: [clang][Interp] Handle dummy pointers in ArrayElemPtr{,Pop} differently Instead of returning false, just ignore the operation and return true; This gives us the desired diagnostic behavior in the added test case. --- clang/lib/AST/Interp/Interp.h | 8 +++++--- clang/test/AST/Interp/c.c | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index bcabd93..290edc0 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1856,7 +1856,7 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.peek(); if (!CheckDummy(S, OpPC, Ptr)) - return false; + return true; if (!OffsetHelper(S, OpPC, Offset, Ptr)) return false; @@ -1869,8 +1869,10 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); - if (!CheckDummy(S, OpPC, Ptr)) - return false; + if (!CheckDummy(S, OpPC, Ptr)) { + S.Stk.push(Ptr); + return true; + } if (!OffsetHelper(S, OpPC, Offset, Ptr)) return false; diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 9ab271a..3605462 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -129,3 +129,8 @@ _Static_assert(sizeof(name2) == 0, ""); // expected-error {{failed}} \ // expected-note {{evaluates to}} \ // pedantic-expected-error {{failed}} \ // pedantic-expected-note {{evaluates to}} + +void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // expected-warning {{refers past the last possible element}} \ + // pedantic-expected-warning {{refers past the last possible element}} \ + // ref-warning {{refers past the last possible element}} \ + // pedantic-ref-warning {{refers past the last possible element}} -- cgit v1.1 From 356fdc31edd1734ef8dc8f010d5f805345157c49 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Fri, 9 Feb 2024 16:23:43 +0100 Subject: [bazel][clang] Fix BUILD after a8d4a024e6bea3ae71d6187f0c040b2b25e4bf69. 
--- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index dda6d94..b8b3fcb 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -321,6 +321,20 @@ gentbl( ) gentbl( + name = "basic_builtins_riscv_gen", + tbl_outs = [( + "-gen-clang-builtins", + "include/clang/Basic/BuiltinsRISCV.inc", + )], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/BuiltinsRISCV.td", + td_srcs = [ + "include/clang/Basic/BuiltinsRISCV.td", + "include/clang/Basic/BuiltinsBase.td", + ], +) + +gentbl( name = "basic_builtins_gen", tbl_outs = [( "-gen-clang-builtins", @@ -656,6 +670,7 @@ cc_library( ":basic_attr_gen", ":basic_builtins_bpf_gen", ":basic_builtins_gen", + ":basic_builtins_riscv_gen", ":basic_internal_headers", ":basic_riscv_sifive_vector_builtins_gen", ":basic_riscv_vector_builtin_cg_gen", -- cgit v1.1 From a9700904765590ca2fbf08c0cc36d0da1107d3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 16:45:58 +0100 Subject: [clang][Interp][NFC] Convert test case to verify=expected,all style --- clang/test/AST/Interp/c.c | 50 ++++++++++++----------------------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 3605462..337a7cf 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify -std=c11 %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected -std=c11 %s -// RUN: %clang_cc1 -verify=ref -std=c11 %s -// RUN: %clang_cc1 -pedantic -verify=pedantic-ref -std=c11 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected,all -std=c11 %s +// RUN: %clang_cc1 -verify=ref,all -std=c11 %s +// RUN: %clang_cc1 -pedantic -verify=pedantic-ref,all -std=c11 %s typedef __INTPTR_TYPE__ intptr_t; typedef __PTRDIFF_TYPE__ ptrdiff_t; @@ -22,10 +22,7 @@ _Static_assert(!!1.0, ""); // pedantic-ref-warning {{not an integer constant exp _Static_assert(!!1, ""); int a = (1 == 1 ? 5 : 3); -_Static_assert(a == 5, ""); // ref-error {{not an integral constant expression}} \ - // pedantic-ref-error {{not an integral constant expression}} \ - // expected-error {{not an integral constant expression}} \ - // pedantic-expected-error {{not an integral constant expression}} +_Static_assert(a == 5, ""); // all-error {{not an integral constant expression}} const int b = 3; @@ -67,25 +64,17 @@ _Static_assert((&a - 100) != 0, ""); // pedantic-ref-warning {{is a GNU extensio /// extern variable of a composite type. /// FIXME: The 'cast from void*' note is missing in the new interpreter. 
extern struct Test50S Test50; -_Static_assert(&Test50 != (void*)0, ""); // ref-warning {{always true}} \ - // pedantic-ref-warning {{always true}} \ +_Static_assert(&Test50 != (void*)0, ""); // all-warning {{always true}} \ // pedantic-ref-warning {{is a GNU extension}} \ // pedantic-ref-note {{cast from 'void *' is not allowed}} \ - // expected-warning {{always true}} \ - // pedantic-expected-warning {{always true}} \ // pedantic-expected-warning {{is a GNU extension}} struct y {int x,y;}; -int a2[(intptr_t)&((struct y*)0)->y]; // expected-warning {{folded to constant array}} \ - // pedantic-expected-warning {{folded to constant array}} \ - // ref-warning {{folded to constant array}} \ - // pedantic-ref-warning {{folded to constant array}} +int a2[(intptr_t)&((struct y*)0)->y]; // all-warning {{folded to constant array}} const struct y *yy = (struct y*)0; -const intptr_t L = (intptr_t)(&(yy->y)); // expected-error {{not a compile-time constant}} \ - // pedantic-expected-error {{not a compile-time constant}} \ - // ref-error {{not a compile-time constant}} \ - // pedantic-ref-error {{not a compile-time constant}} +const intptr_t L = (intptr_t)(&(yy->y)); // all-error {{not a compile-time constant}} + const ptrdiff_t m = &m + 137 - &m; _Static_assert(m == 137, ""); // pedantic-ref-warning {{GNU extension}} \ // pedantic-expected-warning {{GNU extension}} @@ -93,10 +82,7 @@ _Static_assert(m == 137, ""); // pedantic-ref-warning {{GNU extension}} \ /// from test/Sema/switch.c, used to cause an assertion failure. void f (int z) { while (z) { - default: z--; // expected-error {{'default' statement not in switch}} \ - // pedantic-expected-error {{'default' statement not in switch}} \ - // ref-error {{'default' statement not in switch}} \ - // pedantic-ref-error {{'default' statement not in switch}} + default: z--; // all-error {{'default' statement not in switch}} } } @@ -104,15 +90,8 @@ int expr; int chooseexpr[__builtin_choose_expr(1, 1, expr)]; int somefunc(int i) { - return (i, 65537) * 65537; // expected-warning {{left operand of comma operator has no effect}} \ - // expected-warning {{overflow in expression; result is 131073}} \ - // pedantic-expected-warning {{left operand of comma operator has no effect}} \ - // pedantic-expected-warning {{overflow in expression; result is 131073}} \ - // ref-warning {{left operand of comma operator has no effect}} \ - // ref-warning {{overflow in expression; result is 131073}} \ - // pedantic-ref-warning {{left operand of comma operator has no effect}} \ - // pedantic-ref-warning {{overflow in expression; result is 131073}} - + return (i, 65537) * 65537; // all-warning {{left operand of comma operator has no effect}} \ + // all-warning {{overflow in expression; result is 131073}} } /// FIXME: The following test is incorrect in the new interpreter. 
@@ -130,7 +109,4 @@ _Static_assert(sizeof(name2) == 0, ""); // expected-error {{failed}} \ // pedantic-expected-error {{failed}} \ // pedantic-expected-note {{evaluates to}} -void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // expected-warning {{refers past the last possible element}} \ - // pedantic-expected-warning {{refers past the last possible element}} \ - // ref-warning {{refers past the last possible element}} \ - // pedantic-ref-warning {{refers past the last possible element}} +void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // all-warning {{refers past the last possible element}} -- cgit v1.1 From a0635edc5980218ad210da25a5c9afe346110ccb Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 9 Feb 2024 15:48:01 +0000 Subject: [PhaseOrdering] Add tests showing missed simplifications. Add tests showing missed simplifications due to phase ordering. --- .../AArch64/extra-unroll-simplifications.ll | 82 ++++++++++++++++++ .../PhaseOrdering/AArch64/hoist-runtime-checks.ll | 98 ++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll new file mode 100644 index 0000000..6132c35 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='default' -S %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-macosx11.0.0" + +define void @partial_unroll_forced(i32 %N, ptr %src, ptr noalias %dst) { +; CHECK-LABEL: define void @partial_unroll_forced( +; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP141:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.latch.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]] +; CHECK: loop.latch.preheader.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646 +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[SRC_IDX:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[L:%.*]] = load <8 x half>, ptr [[SRC_IDX]], align 16 +; CHECK-NEXT: [[DST_IDX:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ADD:%.*]] = fadd <8 x half> [[L]], [[L]] +; CHECK-NEXT: store <8 x half> [[ADD]], ptr [[DST_IDX]], align 16 +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[SRC_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 
[[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load <8 x half>, ptr [[SRC_IDX_1]], align 16 +; CHECK-NEXT: [[DST_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd <8 x half> [[L_1]], [[L_1]] +; CHECK-NEXT: store <8 x half> [[ADD_1]], ptr [[DST_IDX_1]], align 16 +; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: exit.loopexit.unr-lcssa: +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]] +; CHECK: loop.latch.epil: +; CHECK-NEXT: [[SRC_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_UNR]] +; CHECK-NEXT: [[L_EPIL:%.*]] = load <8 x half>, ptr [[SRC_IDX_EPIL]], align 16 +; CHECK-NEXT: [[DST_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_UNR]] +; CHECK-NEXT: [[ADD_EPIL:%.*]] = fadd <8 x half> [[L_EPIL]], [[L_EPIL]] +; CHECK-NEXT: store <8 x half> [[ADD_EPIL]], ptr [[DST_IDX_EPIL]], align 16 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %cmp14 = icmp slt i32 %iv, %N + br i1 %cmp14, label %loop.latch, label %exit + +loop.latch: + %iv.ext = zext i32 %iv to i64 + %src.idx = getelementptr <8 x half>, ptr %src, i64 %iv.ext + %l = load <8 x half>, ptr %src.idx, align 16 + %dst.idx = getelementptr <8 x half>, ptr %dst, i64 %iv.ext + %add = fadd <8 x half> %l, %l + store <8 x half> %add, ptr %dst.idx, align 16 + %iv.next = add i32 %iv, 1 + br label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.unroll.count", i32 2} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.disable"} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll new file mode 100644 index 0000000..c6c9a52 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='default' -S %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-macosx11.0.0" + +define i32 @read_only_loop_with_runtime_check(ptr noundef %array, i32 noundef %count, i32 noundef %n) { +; CHECK-LABEL: define i32 @read_only_loop_with_runtime_check( +; CHECK-SAME: ptr nocapture noundef readonly [[ARRAY:%.*]], i32 noundef [[COUNT:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i32 [[TMP1]], [[COUNT]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD]], [[IF_END]] ] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: if.end: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP2]], [[SUM_07]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %array.addr = alloca ptr, align 8 + %count.addr = alloca i32, align 4 + %n.addr = alloca i32, align 4 + %sum = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %array, ptr %array.addr, align 8 + store i32 %count, ptr %count.addr, align 4 + store i32 %n, ptr %n.addr, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #3 + store i32 0, ptr %sum, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #3 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %n.addr, align 4 + %cmp = icmp ult i32 %0, %1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3 + br label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, ptr %i, align 4 + %3 = load i32, ptr %count.addr, align 4 + %cmp1 = icmp uge i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + call void @llvm.trap() + br label %if.end + +if.end: ; preds = %if.then, %for.body + %4 = load ptr, ptr %array.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom = zext i32 %5 to i64 + %arrayidx = 
getelementptr inbounds i32, ptr %4, i64 %idxprom
+  %6 = load i32, ptr %arrayidx, align 4
+  %7 = load i32, ptr %sum, align 4
+  %add = add nsw i32 %7, %6
+  store i32 %add, ptr %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %8 = load i32, ptr %i, align 4
+  %inc = add i32 %8, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  %9 = load i32, ptr %sum, align 4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %sum)
+  ret i32 %9
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+
+declare void @llvm.trap()
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
-- 
cgit v1.1

From fdb16e6fd81b38835795f22730b39b30ddd90f07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 16:51:53 +0100
Subject: [clang][Interp] Only use __int128 in test case if supported

---
 clang/test/AST/Interp/c.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 337a7cf..bb2c7cf 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -109,4 +109,6 @@ _Static_assert(sizeof(name2) == 0, ""); // expected-error {{failed}} \
 // pedantic-expected-error {{failed}} \
 // pedantic-expected-note {{evaluates to}}
 
+#ifdef __SIZEOF_INT128__
 void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // all-warning {{refers past the last possible element}}
+#endif
-- 
cgit v1.1

From b081e9d4cafe2563c513ed7b5ae3ced6d177b657 Mon Sep 17 00:00:00 2001
From: Daniel Chen
Date: Fri, 9 Feb 2024 10:56:57 -0500
Subject: [Flang] Fix NULLIFY statement that returns too early for multiple
 procedure pointer objects. (#81164)

The current code that handles the NULLIFY statement for procedure pointers
returns after the first object. This PR removes the `return` so that it can
nullify multiple procedure pointer objects.
---
 flang/lib/Lower/Bridge.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 579f94b..7577c49 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3115,10 +3115,10 @@ private:
         hlfir::Entity nullBoxProc(
             fir::factory::createNullBoxProc(*builder, loc, boxTy));
         builder->createStoreWithConvert(loc, nullBoxProc, pptr);
-        return;
+      } else {
+        fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
+        fir::factory::disassociateMutableBox(*builder, loc, box);
       }
-      fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
-      fir::factory::disassociateMutableBox(*builder, loc, box);
     }
   }
-- 
cgit v1.1

From 935f7d633374f7073fec14927922a2d534c8795f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 9 Feb 2024 17:04:50 +0100
Subject: [clang][Interp][NFC] We do support complex bitint now

Remove a stale FIXME comment and improve the test.
---
 clang/test/AST/Interp/complex.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp
index 7d625ab..9fdaabd 100644
--- a/clang/test/AST/Interp/complex.cpp
+++ b/clang/test/AST/Interp/complex.cpp
@@ -98,8 +98,9 @@ constexpr _Complex int I3 = {15};
 static_assert(__real(I3) == 15, "");
 static_assert(__imag(I3) == 0, "");
 
-/// FIXME: This should work in the new interpreter as well.
-// constexpr _Complex _BitInt(8) A = 0;// = {4}; +constexpr _Complex _BitInt(8) A = {4}; +static_assert(__real(A) == 4, ""); +static_assert(__imag(A) == 0, ""); constexpr _Complex double Doubles[4] = {{1.0, 2.0}}; -- cgit v1.1 From 99d743320c5dddb780f1fb2f49414b10e6a52a05 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Fri, 9 Feb 2024 08:13:15 -0800 Subject: [clang][fmv] Drop .ifunc from target_version's entrypoint's mangling (#81194) Fixes: https://github.com/llvm/llvm-project/issues/81043 --- clang/include/clang/AST/Decl.h | 4 + clang/include/clang/Basic/AttrDocs.td | 8 ++ clang/lib/AST/Decl.cpp | 4 + clang/lib/CodeGen/CodeGenModule.cpp | 16 +++- clang/test/CodeGen/attr-target-version.c | 129 ++++++++++++++------------ clang/test/CodeGenCXX/attr-target-version.cpp | 87 ++++++++--------- 6 files changed, 143 insertions(+), 105 deletions(-) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index f26fb5a..42fdf2b 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -2619,6 +2619,10 @@ public: /// the target-clones functionality. bool isTargetClonesMultiVersion() const; + /// True if this function is a multiversioned dispatch function as a part of + /// the target-version functionality. + bool isTargetVersionMultiVersion() const; + /// \brief Get the associated-constraints of this function declaration. /// Currently, this will either be a vector of size 1 containing the /// trailing-requires-clause or an empty vector. diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 041786f..19a98a0 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2517,6 +2517,14 @@ function it instructs compiler to emit multiple function versions based on priority and target features availability. One of the versions is always ( implicitly or explicitly ) the ``default`` (fallback). Attribute strings can contain dependent features names joined by the "+" sign. + +For targets that support the GNU indirect function (IFUNC) feature, dispatch +is performed by emitting an indirect function that is resolved to the appropriate +target clone at load time. The indirect function is given the name the +multiversioned function would have if it had been declared without the attribute. +For backward compatibility with earlier Clang releases, a function alias with an +``.ifunc`` suffix is also emitted. The ``.ifunc`` suffixed symbol is a deprecated +feature and support for it may be removed in the future. 
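For illustration, a minimal sketch of the source pattern this documentation describes; the snippet is not part of the patch, and the feature strings are examples only:

  __attribute__((target_version("fp16"))) int fmv(void) { return 1; }
  __attribute__((target_version("default"))) int fmv(void) { return 0; }

  int caller(void) {
    /* After this change the call binds to the IFUNC named `fmv`, resolved to
       the best clone at load time; `fmv.ifunc` remains only as a deprecated
       compatibility alias. */
    return fmv();
  }

This matches the updated CHECK lines in the tests below, where `@fmv` becomes the ifunc and `@fmv.ifunc` becomes an alias to it.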
}]; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 26fdfa0..40e2903 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3541,6 +3541,10 @@ bool FunctionDecl::isTargetClonesMultiVersion() const { return isMultiVersion() && hasAttr(); } +bool FunctionDecl::isTargetVersionMultiVersion() const { + return isMultiVersion() && hasAttr(); +} + void FunctionDecl::setPreviousDeclaration(FunctionDecl *PrevDecl) { redeclarable_base::setPreviousDecl(PrevDecl); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 36b63d7..2f923d5 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -30,6 +30,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/CharUnits.h" +#include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" @@ -4212,7 +4213,8 @@ void CodeGenModule::emitMultiVersionFunctions() { llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD); if (auto *IFunc = dyn_cast(ResolverConstant)) { ResolverConstant = IFunc->getResolver(); - if (FD->isTargetClonesMultiVersion()) { + if (FD->isTargetClonesMultiVersion() || + FD->isTargetVersionMultiVersion()) { const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::FunctionType *DeclTy = getTypes().GetFunctionType(FI); std::string MangledName = getMangledNameImpl( @@ -4393,8 +4395,18 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { // a separate resolver). std::string ResolverName = MangledName; if (getTarget().supportsIFunc()) { - if (!FD->isTargetClonesMultiVersion()) + switch (FD->getMultiVersionKind()) { + case MultiVersionKind::None: + llvm_unreachable("unexpected MultiVersionKind::None for resolver"); + case MultiVersionKind::Target: + case MultiVersionKind::CPUSpecific: + case MultiVersionKind::CPUDispatch: ResolverName += ".ifunc"; + break; + case MultiVersionKind::TargetClones: + case MultiVersionKind::TargetVersion: + break; + } } else if (FD->isTargetMultiVersion()) { ResolverName += ".resolver"; } diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 2a96697..c27d48f 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -90,13 +90,20 @@ int hoo(void) { //. 
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @fmv.ifunc = weak_odr ifunc i32 (), ptr @fmv.resolver -// CHECK: @fmv_one.ifunc = weak_odr ifunc i32 (), ptr @fmv_one.resolver -// CHECK: @fmv_two.ifunc = weak_odr ifunc i32 (), ptr @fmv_two.resolver -// CHECK: @fmv_e.ifunc = weak_odr ifunc i32 (), ptr @fmv_e.resolver -// CHECK: @fmv_c.ifunc = weak_odr ifunc void (), ptr @fmv_c.resolver -// CHECK: @fmv_inline.ifunc = weak_odr ifunc i32 (), ptr @fmv_inline.resolver -// CHECK: @fmv_d.ifunc = internal ifunc i32 (), ptr @fmv_d.resolver +// CHECK: @fmv.ifunc = weak_odr alias i32 (), ptr @fmv +// CHECK: @fmv_one.ifunc = weak_odr alias i32 (), ptr @fmv_one +// CHECK: @fmv_two.ifunc = weak_odr alias i32 (), ptr @fmv_two +// CHECK: @fmv_e.ifunc = weak_odr alias i32 (), ptr @fmv_e +// CHECK: @fmv_inline.ifunc = weak_odr alias i32 (), ptr @fmv_inline +// CHECK: @fmv_d.ifunc = internal alias i32 (), ptr @fmv_d +// CHECK: @fmv_c.ifunc = weak_odr alias void (), ptr @fmv_c +// CHECK: @fmv = weak_odr ifunc i32 (), ptr @fmv.resolver +// CHECK: @fmv_one = weak_odr ifunc i32 (), ptr @fmv_one.resolver +// CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver +// CHECK: @fmv_e = weak_odr ifunc i32 (), ptr @fmv_e.resolver +// CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver +// CHECK: @fmv_d = internal ifunc i32 (), ptr @fmv_d.resolver +// CHECK: @fmv_c = weak_odr ifunc void (), ptr @fmv_c.resolver //. // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv._MrngMflagmMfp16fml @@ -105,6 +112,32 @@ int hoo(void) { // CHECK-NEXT: ret i32 1 // // +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMls64 +// CHECK-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@foo +// CHECK-SAME: () #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one() +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two() +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] +// CHECK-NEXT: ret i32 [[ADD3]] +// +// // CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -183,42 +216,16 @@ int hoo(void) { // CHECK-NEXT: ret ptr @fmv.default // // -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMls64 -// CHECK-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// // CHECK-LABEL: define {{[^@]+}}@fmv_one.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_one._MsimdMls64 // // -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp -// CHECK-SAME: () #[[ATTR1]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// // CHECK-LABEL: define {{[^@]+}}@fmv_two.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_two._MsimdMfp16 // // -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@foo -// CHECK-SAME: () #[[ATTR2:[0-9]+]] { -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv.ifunc() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one.ifunc() -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two.ifunc() -// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] -// CHECK-NEXT: ret i32 [[ADD3]] -// -// // CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_e._Mls64 @@ -238,28 +245,14 @@ int hoo(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_c._Mssbs -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @fmv_c.default -// -// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@goo // CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline.ifunc() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e.ifunc() -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d.ifunc() -// CHECK-NEXT: call void @fmv_c.ifunc() +// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d() +// CHECK-NEXT: call void @fmv_c() // CHECK-NEXT: [[CALL3:%.*]] = call i32 @fmv_default() // CHECK-NEXT: ret i32 [[CALL3]] // @@ -412,6 +405,20 @@ int hoo(void) { // CHECK-NEXT: ret ptr @fmv_d.default // // +// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @fmv_c._Mssbs +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @fmv_c.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@recur // CHECK-SAME: () #[[ATTR2]] { @@ -437,9 +444,9 @@ int hoo(void) { // CHECK-NEXT: entry: // CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: call void @f(ptr noundef @fmv.ifunc) -// CHECK-NEXT: store ptr @fmv.ifunc, ptr [[FP1]], align 8 -// CHECK-NEXT: store ptr @fmv.ifunc, ptr [[FP2]], align 8 +// CHECK-NEXT: call void @f(ptr noundef @fmv) +// CHECK-NEXT: store ptr @fmv, ptr [[FP1]], align 8 +// CHECK-NEXT: store ptr @fmv, ptr [[FP2]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FP1]], align 8 // CHECK-NEXT: [[CALL:%.*]] = call i32 [[TMP0]]() // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[FP2]], align 8 @@ -561,13 +568,6 @@ int hoo(void) { // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_c.default -// CHECK-SAME: () #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret void -// -// -// CHECK: 
Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msha1MpmullMf64mm // CHECK-SAME: () #[[ATTR12:[0-9]+]] { // CHECK-NEXT: entry: @@ -700,6 +700,13 @@ int hoo(void) { // CHECK-NEXT: ret i32 1 // // +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_c.default +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv // CHECK-NOFMV-SAME: () #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp index 68dd7be..b63815d 100644 --- a/clang/test/CodeGenCXX/attr-target-version.cpp +++ b/clang/test/CodeGenCXX/attr-target-version.cpp @@ -26,9 +26,12 @@ int bar() { //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_Z3fooi.ifunc = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver -// CHECK: @_Z3foov.ifunc = weak_odr ifunc i32 (), ptr @_Z3foov.resolver -// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver +// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr alias i32 (ptr, i32), ptr @_ZN7MyClass3gooEi +// CHECK: @_Z3fooi.ifunc = weak_odr alias i32 (i32), ptr @_Z3fooi +// CHECK: @_Z3foov.ifunc = weak_odr alias i32 (), ptr @_Z3foov +// CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver +// CHECK: @_Z3fooi = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver +// CHECK: @_Z3foov = weak_odr ifunc i32 (), ptr @_Z3foov.resolver //. // CHECK-LABEL: @_Z3fooi._Mbf16Msme-f64f64( // CHECK-NEXT: entry: @@ -37,39 +40,11 @@ int bar() { // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_Z3fooi.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64 -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @_Z3fooi.default -// -// // CHECK-LABEL: @_Z3foov._Msm4Mebf16( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // -// CHECK-LABEL: @_Z3foov.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16 -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @_Z3foov.default -// -// // CHECK-LABEL: @_ZN7MyClass3gooEi.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -95,24 +70,40 @@ int bar() { // CHECK-LABEL: @_Z3barv( // CHECK-NEXT: entry: // CHECK-NEXT: [[M:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1 -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi.ifunc(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1) -// CHECK-NEXT: [[CALL1:%.*]] = 
call noundef i32 @_Z3fooi.ifunc(i32 noundef 1) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1) +// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z3fooi(i32 noundef 1) // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov.ifunc() +// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov() // CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] // CHECK-NEXT: ret i32 [[ADD3]] // // -// CHECK-LABEL: @_Z3fooi.default( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 -// CHECK-NEXT: ret i32 2 +// CHECK-LABEL: @_Z3fooi.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64 +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @_Z3fooi.default // // -// CHECK-LABEL: @_Z3foov.default( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 4 +// CHECK-LABEL: @_Z3foov.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16 +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @_Z3foov.default // // // CHECK-LABEL: @_ZN7MyClass3gooEi._Mdotprod( @@ -144,6 +135,18 @@ int bar() { // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // +// +// CHECK-LABEL: @_Z3fooi.default( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: ret i32 2 +// +// +// CHECK-LABEL: @_Z3foov.default( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 4 +// //. 
// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" }
// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" }
-- 
cgit v1.1

From 99446df3f5357b327b388bbbb4adf6465999ea60 Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Fri, 9 Feb 2024 08:12:58 -0800
Subject: Bump the minimum LLVM version for chrono data formatter tests

---
 .../data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
index a90fb82..c306315 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
@@ -11,7 +11,7 @@ from lldbsuite.test import lldbutil
 
 class LibcxxChronoDataFormatterTestCase(TestBase):
     @add_test_categories(["libc++"])
-    @skipIf(compiler="clang", compiler_version=["<", "11.0"])
+    @skipIf(compiler="clang", compiler_version=["<", "17.0"])
     def test_with_run_command(self):
         """Test that file and class static variables display correctly."""
         self.build()
-- 
cgit v1.1

From 2095655f8e2324971f11be61b88ef1644d5796b8 Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 9 Feb 2024 08:14:09 -0800
Subject: [clang][sema] Fix -Wunused-function on target_version'd file-scope
 Fn's (#81167)

We should only warn if the default version is the one that is unused.

Fixes: https://github.com/llvm/llvm-project/issues/80227
---
 clang/include/clang/AST/Decl.h                | 4 ++++
 clang/lib/AST/Decl.cpp                        | 5 +++++
 clang/lib/Sema/Sema.cpp                       | 3 ++-
 clang/test/SemaCXX/warn-unused-filescoped.cpp | 16 ++++++++++++++++
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 42fdf2b..61117cc 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -2615,6 +2615,10 @@ public:
   /// the target functionality.
   bool isTargetMultiVersion() const;
 
+  /// True if this function is the default version of a multiversioned dispatch
+  /// function as a part of the target functionality.
+  bool isTargetMultiVersionDefault() const;
+
   /// True if this function is a multiversioned dispatch function as a part of
   /// the target-clones functionality.
bool isTargetClonesMultiVersion() const; diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 40e2903..e281f2d 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3537,6 +3537,11 @@ bool FunctionDecl::isTargetMultiVersion() const { (hasAttr() || hasAttr()); } +bool FunctionDecl::isTargetMultiVersionDefault() const { + return isMultiVersion() && hasAttr() && + getAttr()->isDefaultVersion(); +} + bool FunctionDecl::isTargetClonesMultiVersion() const { return isMultiVersion() && hasAttr(); } diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 2d4e6d1..cfb653e6 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1393,7 +1393,8 @@ void Sema::ActOnEndOfTranslationUnit() { Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl) << /*function=*/0 << DiagD << DiagRange; } - } else { + } else if (!FD->isTargetMultiVersion() || + FD->isTargetMultiVersionDefault()) { if (FD->getDescribedFunctionTemplate()) Diag(DiagD->getLocation(), diag::warn_unused_template) << /*function=*/0 << DiagD << DiagRange; diff --git a/clang/test/SemaCXX/warn-unused-filescoped.cpp b/clang/test/SemaCXX/warn-unused-filescoped.cpp index be8d350..0c347e9 100644 --- a/clang/test/SemaCXX/warn-unused-filescoped.cpp +++ b/clang/test/SemaCXX/warn-unused-filescoped.cpp @@ -236,4 +236,20 @@ constexpr int constexpr4() { return 2; } #endif } +__attribute__((target_version("fp16"))) +static int not_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int not_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int not_used_fmv(void) { return 0; } // expected-warning {{unused function 'not_used_fmv'}} + + +__attribute__((target_version("fp16"))) +static int definitely_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int definitely_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int definitely_used_fmv(void) { return 0; } +int definite_user(void) { return definitely_used_fmv(); } + #endif -- cgit v1.1 From 7ddc32052546abd41656d2e670f3902b1bf805a7 Mon Sep 17 00:00:00 2001 From: quic-areg Date: Fri, 9 Feb 2024 10:15:23 -0600 Subject: [llvm-objcopy] Support SREC output format (#75874) Adds a new output target "srec" to write SREC files from ELF inputs. https://en.wikipedia.org/wiki/SREC_(file_format) --- llvm/docs/CommandGuide/llvm-objcopy.rst | 9 +- llvm/include/llvm/ObjCopy/CommonConfig.h | 7 +- llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp | 4 +- llvm/lib/ObjCopy/ELF/ELFObject.cpp | 280 ++++++++++++++++++---- llvm/lib/ObjCopy/ELF/ELFObject.h | 134 ++++++++++- llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test | 6 +- llvm/test/tools/llvm-objcopy/ELF/srec-writer.test | 196 +++++++++++++++ llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 1 + llvm/tools/llvm-objcopy/llvm-objcopy.cpp | 1 + 9 files changed, 576 insertions(+), 62 deletions(-) create mode 100644 llvm/test/tools/llvm-objcopy/ELF/srec-writer.test diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 42d11fa..b823be9 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -544,8 +544,13 @@ options. For GNU :program:`objcopy` compatibility, the values are all bfdnames. - `elf32-sparc` - `elf32-sparcel` -Additionally, all targets except `binary` and `ihex` can have `-freebsd` as a -suffix. 
+The following formats are supported by :program:`llvm-objcopy` for the
+:option:`--output-target` only:
+
+- `srec`
+
+Additionally, all targets except `binary`, `ihex`, and `srec` can have
+`-freebsd` as a suffix.
 
 BINARY INPUT AND OUTPUT
 -----------------------
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 0d9320e..3833959 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -27,12 +27,7 @@
 namespace llvm {
 namespace objcopy {
 
-enum class FileFormat {
-  Unspecified,
-  ELF,
-  Binary,
-  IHex,
-};
+enum class FileFormat { Unspecified, ELF, Binary, IHex, SREC };
 
 // This type keeps track of the machine info for various architectures. This
 // lets us map architecture names to ELF types and the e_machine value of the
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 36f7994..1b3a582 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -182,7 +182,9 @@ static std::unique_ptr<Writer> createWriter(const CommonConfig &Config,
   case FileFormat::Binary:
     return std::make_unique<BinaryWriter>(Obj, Out, Config);
   case FileFormat::IHex:
-    return std::make_unique<IHexWriter>(Obj, Out);
+    return std::make_unique<IHexWriter>(Obj, Out, Config.OutputFilename);
+  case FileFormat::SREC:
+    return std::make_unique<SRECWriter>(Obj, Out, Config.OutputFilename);
   default:
     return createELFWriter(Config, Obj, Out, OutputElfType);
   }
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
index c8b66d6..c2de456 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -2707,10 +2707,52 @@ Error BinaryWriter::finalize() {
   return Error::success();
 }
 
-bool IHexWriter::SectionCompare::operator()(const SectionBase *Lhs,
-                                            const SectionBase *Rhs) const {
-  return (sectionPhysicalAddr(Lhs) & 0xFFFFFFFFU) <
-         (sectionPhysicalAddr(Rhs) & 0xFFFFFFFFU);
+Error ASCIIHexWriter::checkSection(const SectionBase &S) const {
+  if (addressOverflows32bit(S.Addr) ||
+      addressOverflows32bit(S.Addr + S.Size - 1))
+    return createStringError(
+        errc::invalid_argument,
+        "section '%s' address range [0x%llx, 0x%llx] is not 32 bit",
+        S.Name.c_str(), S.Addr, S.Addr + S.Size - 1);
+  return Error::success();
+}
+
+Error ASCIIHexWriter::finalize() {
+  // We can't write 64-bit addresses.
+ if (addressOverflows32bit(Obj.Entry)) + return createStringError(errc::invalid_argument, + "entry point address 0x%llx overflows 32 bits", + Obj.Entry); + + for (const SectionBase &S : Obj.sections()) { + if ((S.Flags & ELF::SHF_ALLOC) && S.Type != ELF::SHT_NOBITS && S.Size > 0) { + if (Error E = checkSection(S)) + return E; + Sections.push_back(&S); + } + } + + llvm::sort(Sections, [](const SectionBase *A, const SectionBase *B) { + return sectionPhysicalAddr(A) < sectionPhysicalAddr(B); + }); + + std::unique_ptr EmptyBuffer = + WritableMemoryBuffer::getNewMemBuffer(0); + if (!EmptyBuffer) + return createStringError(errc::not_enough_memory, + "failed to allocate memory buffer of 0 bytes"); + + Expected ExpTotalSize = getTotalSize(*EmptyBuffer); + if (!ExpTotalSize) + return ExpTotalSize.takeError(); + TotalSize = *ExpTotalSize; + + Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize); + if (!Buf) + return createStringError(errc::not_enough_memory, + "failed to allocate memory buffer of 0x" + + Twine::utohexstr(TotalSize) + " bytes"); + return Error::success(); } uint64_t IHexWriter::writeEntryPointRecord(uint8_t *Buf) { @@ -2740,6 +2782,20 @@ uint64_t IHexWriter::writeEndOfFileRecord(uint8_t *Buf) { return HexData.size(); } +Expected +IHexWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const { + IHexSectionWriterBase LengthCalc(EmptyBuffer); + for (const SectionBase *Sec : Sections) + if (Error Err = Sec->accept(LengthCalc)) + return Err; + + // We need space to write section records + StartAddress record + // (if start adress is not zero) + EndOfFile record. + return LengthCalc.getBufferOffset() + + (Obj.Entry ? IHexRecord::getLineLength(4) : 0) + + IHexRecord::getLineLength(0); +} + Error IHexWriter::write() { IHexSectionWriter Writer(*Buf); // Write sections. @@ -2762,54 +2818,196 @@ Error IHexWriter::write() { return Error::success(); } -Error IHexWriter::checkSection(const SectionBase &Sec) { - uint64_t Addr = sectionPhysicalAddr(&Sec); - if (addressOverflows32bit(Addr) || addressOverflows32bit(Addr + Sec.Size - 1)) - return createStringError( - errc::invalid_argument, - "Section '%s' address range [0x%llx, 0x%llx] is not 32 bit", - Sec.Name.c_str(), Addr, Addr + Sec.Size - 1); +Error SRECSectionWriterBase::visit(const StringTableSection &Sec) { + // Check that the sizer has already done its work. + assert(Sec.Size == Sec.StrTabBuilder.getSize() && + "Expected section size to have been finalized"); + // We don't need to write anything here because the real writer has already + // done it. return Error::success(); } -Error IHexWriter::finalize() { - // We can't write 64-bit addresses. 
- if (addressOverflows32bit(Obj.Entry)) - return createStringError(errc::invalid_argument, - "Entry point address 0x%llx overflows 32 bits", - Obj.Entry); +Error SRECSectionWriterBase::visit(const Section &Sec) { + writeSection(Sec, Sec.Contents); + return Error::success(); +} - for (const SectionBase &Sec : Obj.sections()) - if ((Sec.Flags & ELF::SHF_ALLOC) && Sec.Type != ELF::SHT_NOBITS && - Sec.Size > 0) { - if (Error E = checkSection(Sec)) - return E; - Sections.insert(&Sec); - } +Error SRECSectionWriterBase::visit(const OwnedDataSection &Sec) { + writeSection(Sec, Sec.Data); + return Error::success(); +} - std::unique_ptr EmptyBuffer = - WritableMemoryBuffer::getNewMemBuffer(0); - if (!EmptyBuffer) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of 0 bytes"); +Error SRECSectionWriterBase::visit(const DynamicRelocationSection &Sec) { + writeSection(Sec, Sec.Contents); + return Error::success(); +} + +void SRECSectionWriter::writeRecord(SRecord &Record, uint64_t Off) { + SRecLineData Data = Record.toString(); + memcpy(Out.getBufferStart() + Off, Data.data(), Data.size()); +} - IHexSectionWriterBase LengthCalc(*EmptyBuffer); +void SRECSectionWriterBase::writeRecords(uint32_t Entry) { + // The ELF header could contain an entry point outside of the sections we have + // seen that does not fit the current record Type. + Type = std::max(Type, SRecord::getType(Entry)); + uint64_t Off = HeaderSize; + for (SRecord &Record : Records) { + Record.Type = Type; + writeRecord(Record, Off); + Off += Record.getSize(); + } + Offset = Off; +} + +void SRECSectionWriterBase::writeSection(const SectionBase &S, + ArrayRef Data) { + const uint32_t ChunkSize = 16; + uint32_t Address = sectionPhysicalAddr(&S); + uint32_t EndAddr = Address + S.Size - 1; + Type = std::max(SRecord::getType(EndAddr), Type); + while (!Data.empty()) { + uint64_t DataSize = std::min(Data.size(), ChunkSize); + SRecord Record{Type, Address, Data.take_front(DataSize)}; + Records.push_back(Record); + Data = Data.drop_front(DataSize); + Address += DataSize; + } +} + +Error SRECSectionWriter::visit(const StringTableSection &Sec) { + assert(Sec.Size == Sec.StrTabBuilder.getSize() && + "Section size does not match the section's string table builder size"); + std::vector Data(Sec.Size); + Sec.StrTabBuilder.write(Data.data()); + writeSection(Sec, Data); + return Error::success(); +} + +SRecLineData SRecord::toString() const { + SRecLineData Line(getSize()); + auto *Iter = Line.begin(); + *Iter++ = 'S'; + *Iter++ = '0' + Type; + // Write 1 byte (2 hex characters) record count. + Iter = toHexStr(getCount(), Iter, 2); + // Write the address field with length depending on record type. + Iter = toHexStr(Address, Iter, getAddressSize()); + // Write data byte by byte. + for (uint8_t X : Data) + Iter = toHexStr(X, Iter, 2); + // Write the 1 byte checksum. + Iter = toHexStr(getChecksum(), Iter, 2); + *Iter++ = '\r'; + *Iter++ = '\n'; + assert(Iter == Line.end()); + return Line; +} + +uint8_t SRecord::getChecksum() const { + uint32_t Sum = getCount(); + Sum += (Address >> 24) & 0xFF; + Sum += (Address >> 16) & 0xFF; + Sum += (Address >> 8) & 0xFF; + Sum += Address & 0xFF; + for (uint8_t Byte : Data) + Sum += Byte; + return 0xFF - (Sum & 0xFF); +} + +size_t SRecord::getSize() const { + // Type, Count, Checksum, and CRLF are two characters each. 
+ return 2 + 2 + getAddressSize() + Data.size() * 2 + 2 + 2; +} + +uint8_t SRecord::getAddressSize() const { + switch (Type) { + case Type::S2: + return 6; + case Type::S3: + return 8; + case Type::S7: + return 8; + case Type::S8: + return 6; + default: + return 4; + } +} + +uint8_t SRecord::getCount() const { + uint8_t DataSize = Data.size(); + uint8_t ChecksumSize = 1; + return getAddressSize() / 2 + DataSize + ChecksumSize; +} + +uint8_t SRecord::getType(uint32_t Address) { + if (isUInt<16>(Address)) + return SRecord::S1; + if (isUInt<24>(Address)) + return SRecord::S2; + return SRecord::S3; +} + +SRecord SRecord::getHeader(StringRef FileName) { + // Header is a record with Type S0, Address 0, and Data that is a + // vendor-specific text comment. For the comment we will use the output file + // name truncated to 40 characters to match the behavior of GNU objcopy. + StringRef HeaderContents = FileName.slice(0, 40); + ArrayRef Data( + reinterpret_cast(HeaderContents.data()), + HeaderContents.size()); + return {SRecord::S0, 0, Data}; +} + +size_t SRECWriter::writeHeader(uint8_t *Buf) { + SRecLineData Record = SRecord::getHeader(OutputFileName).toString(); + memcpy(Buf, Record.data(), Record.size()); + return Record.size(); +} + +size_t SRECWriter::writeTerminator(uint8_t *Buf, uint8_t Type) { + assert(Type >= SRecord::S7 && Type <= SRecord::S9 && + "Invalid record type for terminator"); + uint32_t Entry = Obj.Entry; + SRecLineData Data = SRecord{Type, Entry, {}}.toString(); + memcpy(Buf, Data.data(), Data.size()); + return Data.size(); +} + +Expected +SRECWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const { + SRECSizeCalculator SizeCalc(EmptyBuffer, 0); for (const SectionBase *Sec : Sections) - if (Error Err = Sec->accept(LengthCalc)) + if (Error Err = Sec->accept(SizeCalc)) return Err; - // We need space to write section records + StartAddress record - // (if start adress is not zero) + EndOfFile record. - TotalSize = LengthCalc.getBufferOffset() + - (Obj.Entry ? IHexRecord::getLineLength(4) : 0) + - IHexRecord::getLineLength(0); + SizeCalc.writeRecords(Obj.Entry); + // We need to add the size of the Header and Terminator records. + SRecord Header = SRecord::getHeader(OutputFileName); + uint8_t TerminatorType = 10 - SizeCalc.getType(); + SRecord Terminator = {TerminatorType, static_cast(Obj.Entry), {}}; + return Header.getSize() + SizeCalc.getBufferOffset() + Terminator.getSize(); +} - Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize); - if (!Buf) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of " + - Twine::utohexstr(TotalSize) + " bytes"); +Error SRECWriter::write() { + uint32_t HeaderSize = + writeHeader(reinterpret_cast(Buf->getBufferStart())); + SRECSectionWriter Writer(*Buf, HeaderSize); + for (const SectionBase *S : Sections) { + if (Error E = S->accept(Writer)) + return E; + } + Writer.writeRecords(Obj.Entry); + uint64_t Offset = Writer.getBufferOffset(); + // An S1 record terminates with an S9 record, S2 with S8, and S3 with S7. 
+ uint8_t TerminatorType = 10 - Writer.getType(); + Offset += writeTerminator( + reinterpret_cast(Buf->getBufferStart() + Offset), + TerminatorType); + assert(Offset == TotalSize); + Out.write(Buf->getBufferStart(), Buf->getBufferSize()); return Error::success(); } diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 95bea09..7a2e20d 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -172,6 +172,9 @@ public: friend class SectionWriter; \ friend class IHexSectionWriterBase; \ friend class IHexSectionWriter; \ + friend class SRECSectionWriter; \ + friend class SRECSectionWriterBase; \ + friend class SRECSizeCalculator; \ template friend class ELFSectionWriter; \ template friend class ELFSectionSizer; @@ -371,23 +374,136 @@ public: : Writer(Obj, Out), GapFill(Config.GapFill), PadTo(Config.PadTo) {} }; -class IHexWriter : public Writer { - struct SectionCompare { - bool operator()(const SectionBase *Lhs, const SectionBase *Rhs) const; - }; +// A base class for writing ascii hex formats such as srec and ihex. +class ASCIIHexWriter : public Writer { +public: + ASCIIHexWriter(Object &Obj, raw_ostream &OS, StringRef OutputFile) + : Writer(Obj, OS), OutputFileName(OutputFile) {} + Error finalize() override; - std::set Sections; +protected: + StringRef OutputFileName; size_t TotalSize = 0; + std::vector Sections; + + Error checkSection(const SectionBase &S) const; + virtual Expected + getTotalSize(WritableMemoryBuffer &EmptyBuffer) const = 0; +}; + +class IHexWriter : public ASCIIHexWriter { +public: + Error write() override; + IHexWriter(Object &Obj, raw_ostream &Out, StringRef OutputFile) + : ASCIIHexWriter(Obj, Out, OutputFile) {} - Error checkSection(const SectionBase &Sec); +private: uint64_t writeEntryPointRecord(uint8_t *Buf); uint64_t writeEndOfFileRecord(uint8_t *Buf); + Expected + getTotalSize(WritableMemoryBuffer &EmptyBuffer) const override; +}; +class SRECWriter : public ASCIIHexWriter { public: - ~IHexWriter() {} - Error finalize() override; + SRECWriter(Object &Obj, raw_ostream &OS, StringRef OutputFile) + : ASCIIHexWriter(Obj, OS, OutputFile) {} Error write() override; - IHexWriter(Object &Obj, raw_ostream &Out) : Writer(Obj, Out) {} + +private: + size_t writeHeader(uint8_t *Buf); + size_t writeTerminator(uint8_t *Buf, uint8_t Type); + Expected + getTotalSize(WritableMemoryBuffer &EmptyBuffer) const override; +}; + +using SRecLineData = SmallVector; +struct SRecord { + uint8_t Type; + uint32_t Address; + ArrayRef Data; + SRecLineData toString() const; + uint8_t getCount() const; + // Get address size in characters. + uint8_t getAddressSize() const; + uint8_t getChecksum() const; + size_t getSize() const; + static SRecord getHeader(StringRef FileName); + static uint8_t getType(uint32_t Address); + + enum Type : uint8_t { + // Vendor specific text comment. + S0 = 0, + // Data that starts at a 16 bit address. + S1 = 1, + // Data that starts at a 24 bit address. + S2 = 2, + // Data that starts at a 32 bit address. + S3 = 3, + // Reserved. + S4 = 4, + // 16 bit count of S1/S2/S3 records (optional). + S5 = 5, + // 32 bit count of S1/S2/S3 records (optional). + S6 = 6, + // Terminates a series of S3 records. + S7 = 7, + // Terminates a series of S2 records. + S8 = 8, + // Terminates a series of S1 records. 
+    S9 = 9
+  };
+};
+
+class SRECSectionWriterBase : public BinarySectionWriter {
+public:
+  explicit SRECSectionWriterBase(WritableMemoryBuffer &Buf,
+                                 uint64_t StartOffset)
+      : BinarySectionWriter(Buf), Offset(StartOffset), HeaderSize(StartOffset) {
+  }
+
+  using BinarySectionWriter::visit;
+
+  void writeRecords(uint32_t Entry);
+  uint64_t getBufferOffset() const { return Offset; }
+  Error visit(const Section &S) override;
+  Error visit(const OwnedDataSection &S) override;
+  Error visit(const StringTableSection &S) override;
+  Error visit(const DynamicRelocationSection &S) override;
+  uint8_t getType() const { return Type; };
+
+protected:
+  // Offset in the output buffer.
+  uint64_t Offset;
+  // Sections start after the header.
+  uint64_t HeaderSize;
+  // Type of records to write.
+  uint8_t Type = SRecord::S1;
+  std::vector<SRecord> Records;
+
+  void writeSection(const SectionBase &S, ArrayRef<uint8_t> Data);
+  virtual void writeRecord(SRecord &Record, uint64_t Off) = 0;
+};
+
+// An SRECSectionWriterBase that visits sections but does not write anything.
+// This class is only used to calculate the size of the output file.
+class SRECSizeCalculator : public SRECSectionWriterBase {
+public:
+  SRECSizeCalculator(WritableMemoryBuffer &EmptyBuffer, uint64_t Offset)
+      : SRECSectionWriterBase(EmptyBuffer, Offset) {}
+
+protected:
+  void writeRecord(SRecord &Record, uint64_t Off) override {}
+};
+
+class SRECSectionWriter : public SRECSectionWriterBase {
+public:
+  SRECSectionWriter(WritableMemoryBuffer &Buf, uint64_t Offset)
+      : SRECSectionWriterBase(Buf, Offset) {}
+  Error visit(const StringTableSection &Sec) override;
+
+protected:
+  void writeRecord(SRecord &Record, uint64_t Off) override;
 };
 
 class SectionBase {
diff --git a/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test b/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
index 09ff8ae..6c07f9f 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
@@ -70,8 +70,8 @@
 # SIGN_EXTENDED-NEXT: :051000000001020304E1
 # SIGN_EXTENDED-NEXT: :00000001FF
 
-# BAD-ADDR: error: {{.*}}: Section '.text2' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
-# BAD-ADDR2: error: {{.*}}: Section '.text3' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
+# BAD-ADDR: error: {{.*}}: section '.text2' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
+# BAD-ADDR2: error: {{.*}}: section '.text3' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
 
 # There shouldn't be 'ExtendedAddr' nor 'Data' records
 # ZERO_SIZE_SEC-NOT: :02000004
@@ -81,4 +81,4 @@
 # START1: :040000030000FFFFFB
 # START2: :0400000500100000E7
 # START3: :040000058000100067
-# BAD-START: error: {{.*}}: Entry point address 0x{{.*}} overflows 32 bits
+# BAD-START: error: {{.*}}: entry point address 0x{{.*}} overflows 32 bits
diff --git a/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test b/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test
new file mode 100644
index 0000000..e96b87b
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test
@@ -0,0 +1,196 @@
+## Check for basic functionality using an input file with
+## various section types, addresses, data, and no segments.
+# RUN: yaml2obj %s --docnum=1 -o %t
+# RUN: llvm-objcopy -O srec %t - | \
+# RUN:   FileCheck --match-full-lines --strict-whitespace %s --check-prefix=SREC
+
+## The record type for the header should be S0 with a 2 byte address
+## of 0. For an output file named "-" the header data field should contain "2D".
+## The byte count field should therefore have a value of 4: 2 bytes for address, +## 1 byte for output file and 1 byte for checksum. + # SREC:S00400002DCE +# SREC-NEXT:S31500001000000102030405060708090A0B0C0D0E0F62 +# SREC-NEXT:S30A0000101010111213147B +# SREC-NEXT:S30F00EFFFFF1111111111111111111159 +# SREC-NEXT:S31000FFFFF83031323334353637383940AC +# SREC-NEXT:S30A8000100000010203045B +# SREC-NEXT:S70500000000FA + +## Terminator should contain the entry point. +# RUN: llvm-objcopy -O srec --set-start=0xF0000000 %t --only-section=.dummy - 2>&1 | \ +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY +## Sign-extended entry point is OK. +# RUN: llvm-objcopy -O srec --set-start=0xFFFFFFFFF0000000 %t --only-section=.dummy - 2>&1 | \ +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY + + # ENTRY:S00400002DCE +# ENTRY-NEXT:S705F00000000A + +## Start address which exceeds 32 bit range triggers an error. +# RUN: not llvm-objcopy -O srec --set-start=0xF00000000 %t - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD_START + +# BAD_START: entry point address 0xf00000000 overflows 32 bits + +## Sign-extended start address which exceeds 32 bit range triggers an error. +# RUN: not llvm-objcopy -O srec --set-start=0xFFFFFFFF0F000000 %t - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD_EXTENDED_START + +# BAD_EXTENDED_START: entry point address 0xffffffff0f000000 overflows 32 bits + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .data1 +## Records for this section should come last. + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "11111111111111111111" + Address: 0xEFFFFF + - Name: .data2 +## This section overlaps 24-bit address boundary, so we expect +## its record type to be S3. + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "3031323334353637383940" + Address: 0xFFFFF8 +## Sign-extended addresses are OK. + - Name: .data3 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0xFFFFFFFF80001000 + Content: "0001020304" + - Name: .text +## This section's contents exceed default line length of 16 bytes +## so we expect two lines created for it. Records for this section +## should appear before records for the previous section. + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x1000 + Content: "000102030405060708090A0B0C0D0E0F1011121314" + - Name: .bss +## NOBITS sections are not written. + Type: SHT_NOBITS + Flags: [ SHF_ALLOC ] + Address: 0x10100 + Size: 0x1000 + - Name: .dummy +## Non-allocatable sections are not written. + Type: SHT_PROGBITS + Flags: [ ] + Address: 0x20FFF8 + Size: 65536 + +## Check for various error cases. + +## Check that section address range overlapping 32 bit range +## triggers an error. +# RUN: yaml2obj %s --docnum=2 -o %t.err +# RUN: not llvm-objcopy -O srec --only-section=.text1 %t.err - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD-ADDR +# RUN: not llvm-objcopy -O srec --only-section=.text2 %t.err - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD-ADDR2 + +# BAD-ADDR: section '.text1' address range [0xfffffff8, 0x100000000] is not 32 bit +# BAD-ADDR2: section '.text2' address range [0xffffffff0, 0xffffffff4] is not 32 bit + +## Check that zero length section is not written. +# RUN: llvm-objcopy -O srec --only-section=.text %t.err - | \ +# RUN: FileCheck --match-full-lines --strict-whitespace --implicit-check-not={{.}} %s --check-prefix=ZERO_SIZE_SEC + +## There should be no records besides header and terminator. 
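+## As a cross-check, a record's checksum is the ones' complement of the least
+## significant byte of the sum of its count, address, and data bytes: for the
+## header S00400002DCE, 0x04 + 0x00 + 0x00 + 0x2D = 0x31 and ~0x31 & 0xFF = 0xCE;
+## for the S9 terminator S9030000FC, ~0x03 & 0xFF = 0xFC.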
+# ZERO_SIZE_SEC:S00400002DCE
+# ZERO_SIZE_SEC-NEXT:S9030000FC
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:    .text1
+## Part of section data is in 32-bit address range and part isn't.
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0xFFFFFFF8
+    Content: "000102030405060708"
+  - Name:    .text2
+## Entire section is outside of 32-bit range.
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0xFFFFFFFF0
+    Content: "0001020304"
+
+## This tests an input file with segments and expects
+## physical addresses instead of virtual addresses.
+# RUN: yaml2obj %s --docnum=3 -o %t.seg
+# RUN: llvm-objcopy -O srec %t.seg - | \
+# RUN:   FileCheck --match-full-lines --strict-whitespace %s --check-prefix=PADDR
+
+     # PADDR:S00400002DCE
+# PADDR-NEXT:S214100000000102030405060708090A0B0C0D0E0F63
+# PADDR-NEXT:S20910001010111213147B
+# PADDR-NEXT:S20F10001530313233343536373839407E
+# PADDR-NEXT:S20810002040414243C1
+# PADDR-NEXT:S20F10002450515253545556575859600F
+# PADDR-NEXT:S20720FFF8000000E1
+# PADDR-NEXT:S804100000EB
+
+--- !ELF
+## This file has a non-contiguous section layout with large gaps.
+## These sections are all tightly packed into one PT_LOAD segment
+## starting at physical address 0x100000. Records should use physical addresses.
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+  Entry:   0x100000
+Sections:
+  - Name:    .text
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0x0
+    Content: "000102030405060708090A0B0C0D0E0F1011121314"
+  - Name:    .data1
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Content: "3031323334353637383940"
+    Address: 0xFFF8
+  - Name:    .data2
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Content: "40414243"
+    Address: 0x10100
+  - Name:    .data3
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Content: "5051525354555657585960"
+    Address: 0x10FFF8
+  - Name:    .bss
+    Type:    SHT_NOBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0x10100
+    Size:    0x1000
+  - Name:    .dummy
+    Type:    SHT_PROGBITS
+    Flags:   [ SHF_ALLOC ]
+    Address: 0x20FFF8
+    Size:    3
+  - Name:    .nonalloc
+    Type:    SHT_PROGBITS
+    Flags:   [ ]
+    Address: 0x300000
+    Size:    1
+ProgramHeaders:
+  - Type:     PT_LOAD
+    Flags:    [ PF_X, PF_R ]
+    VAddr:    0xF00000000
+    PAddr:    0x100000
+    FirstSec: .text
+    LastSec:  .bss
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 394eaca..9a9b631 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -687,6 +687,7 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
   Config.OutputFormat = StringSwitch<FileFormat>(OutputFormat)
                             .Case("binary", FileFormat::Binary)
                             .Case("ihex", FileFormat::IHex)
+                            .Case("srec", FileFormat::SREC)
                             .Default(FileFormat::Unspecified);
   if (Config.OutputFormat == FileFormat::Unspecified) {
     if (OutputFormat.empty()) {
diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
index 730f423..ad3e604 100644
--- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -121,6 +121,7 @@ static Error executeObjcopyOnRawBinary(ConfigManager &ConfigMgr,
   case FileFormat::Binary:
   case FileFormat::IHex:
   case FileFormat::Unspecified:
+  case FileFormat::SREC:
     Expected<const ELFConfig &> ELFConfig = ConfigMgr.getELFConfig();
     if (!ELFConfig)
       return ELFConfig.takeError();
-- 
cgit v1.1


From 1f20bc2cd273dd21459b9007a10c6aa67e5da1e2 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Fri, 9 Feb 2024 11:21:04 -0500
Subject: [libc][math] Add C23 math function fdimf128. (#81074) --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 ++ libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/fdimf128.h | 20 +++++++++++++++++++ libc/src/math/generic/CMakeLists.txt | 32 +++++++++++++++++++++++++----- libc/src/math/generic/fdimf128.cpp | 19 ++++++++++++++++++ libc/test/src/math/smoke/CMakeLists.txt | 17 +++++++++++++--- libc/test/src/math/smoke/FDimTest.h | 21 +++++++++++++++----- libc/test/src/math/smoke/fdim_test.cpp | 22 +------------------- libc/test/src/math/smoke/fdimf128_test.cpp | 13 ++++++++++++ libc/test/src/math/smoke/fdimf_test.cpp | 24 +--------------------- libc/test/src/math/smoke/fdiml_test.cpp | 24 +--------------------- 15 files changed, 119 insertions(+), 80 deletions(-) create mode 100644 libc/src/math/fdimf128.h create mode 100644 libc/src/math/generic/fdimf128.cpp create mode 100644 libc/test/src/math/smoke/fdimf128_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 5b03080..f75b267 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -382,6 +382,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.ceilf128 libc.src.math.copysignf128 libc.src.math.fabsf128 + libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 5e98538..762beb9 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -391,6 +391,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.ceilf128 libc.src.math.copysignf128 libc.src.math.fabsf128 + libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index b35fc9f..52a3ce0 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -410,6 +410,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.ceilf128 libc.src.math.copysignf128 libc.src.math.fabsf128 + libc.src.math.fdimf128 libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 3af7e10..2758b42 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -138,6 +138,8 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fdiml | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| fdimf128 | |check| | |check| | | |check| | | | | | | | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | floor | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | floorf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td 
index e37f95a..9c8b5e5 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -374,6 +374,7 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"fdim", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fdimf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"fdiml", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"fdimf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
 
           FunctionSpec<"floor", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"floorf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index d4dbeeb..8cdd84a 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -111,6 +111,7 @@ add_math_entrypoint_object(fabsf128)
 add_math_entrypoint_object(fdim)
 add_math_entrypoint_object(fdimf)
 add_math_entrypoint_object(fdiml)
+add_math_entrypoint_object(fdimf128)
 
 add_math_entrypoint_object(floor)
 add_math_entrypoint_object(floorf)
diff --git a/libc/src/math/fdimf128.h b/libc/src/math/fdimf128.h
new file mode 100644
index 0000000..c6f488a
--- /dev/null
+++ b/libc/src/math/fdimf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fdimf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FDIMF128_H
+#define LLVM_LIBC_SRC_MATH_FDIMF128_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 fdimf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FDIMF128_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 05b70be..3216ec3 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -43,6 +43,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -215,6 +216,7 @@ add_entrypoint_object(
   HDRS
     ../fabsf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
@@ -265,6 +267,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -313,6 +316,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -361,6 +365,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
@@ -899,6 +904,7 @@ add_entrypoint_object(
   HDRS
     ../copysignf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -1298,6 +1304,7 @@ add_entrypoint_object(
   HDRS
     ../fminf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
@@ -1346,6 +1353,7 @@ add_entrypoint_object(
   HDRS
     ../fmaxf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
@@ -1394,6 +1402,7 @@ add_entrypoint_object(
   HDRS
     ../sqrtf128.h
   DEPENDS
+    libc.src.__support.macros.properties.float
     libc.src.__support.FPUtil.sqrt
   COMPILE_OPTIONS
     -O3
@@ -1491,10 +1500,10 @@ add_entrypoint_object(
     fdim.cpp
   HDRS
     ../fdim.h
+  COMPILE_OPTIONS
+    -O3
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -1503,10 +1512,10 @@ add_entrypoint_object(
     fdimf.cpp
   HDRS
     ../fdimf.h
+  COMPILE_OPTIONS
+    -O3
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -1515,10 +1524,23 @@ add_entrypoint_object(
     fdiml.cpp
   HDRS
     ../fdiml.h
+  COMPILE_OPTIONS
+    -O3
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
+)
+
+add_entrypoint_object(
+  fdimf128
+  SRCS
+    fdimf128.cpp
+  HDRS
+    ../fdimf128.h
   COMPILE_OPTIONS
-    -O2
+    -O3
+  DEPENDS
+    libc.src.__support.macros.properties.float
+    libc.src.__support.FPUtil.basic_operations
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/fdimf128.cpp b/libc/src/math/generic/fdimf128.cpp
new file mode 100644
index 0000000..a3ea9e5
--- /dev/null
+++ b/libc/src/math/generic/fdimf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of fdimf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fdimf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, fdimf128, (float128 x, float128 y)) {
+  return fputil::fdim(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 4ee81ec..93ce0b7 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1007,7 +1007,6 @@ add_fp_unittest(
   HDRS
     FDimTest.h
   DEPENDS
-    libc.include.math
     libc.src.math.fdimf
     libc.src.__support.FPUtil.basic_operations
     libc.src.__support.FPUtil.fp_bits
@@ -1022,7 +1021,6 @@ add_fp_unittest(
   HDRS
     FDimTest.h
   DEPENDS
-    libc.include.math
     libc.src.math.fdim
     libc.src.__support.FPUtil.basic_operations
     libc.src.__support.FPUtil.fp_bits
@@ -1037,12 +1035,25 @@ add_fp_unittest(
   HDRS
     FDimTest.h
   DEPENDS
-    libc.include.math
     libc.src.math.fdiml
     libc.src.__support.FPUtil.basic_operations
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  fdimf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fdimf128_test.cpp
+  HDRS
+    FDimTest.h
+  DEPENDS
+    libc.src.math.fdimf128
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.fp_bits
+)
+
 # FIXME: These tests are currently broken on the GPU.
 if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
   add_fp_unittest(
diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h
index e00b4fd..5cb3dd1 100644
--- a/libc/test/src/math/smoke/FDimTest.h
+++ b/libc/test/src/math/smoke/FDimTest.h
@@ -10,7 +10,6 @@
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test {
@@ -26,7 +25,7 @@ public:
   const T neg_zero = FPBits::zero(Sign::NEG).get_val();
   const T nan = FPBits::quiet_nan().get_val();
 
-  void test_na_n_arg(FuncPtr func) {
+  void test_nan_arg(FuncPtr func) {
     EXPECT_FP_EQ(nan, func(nan, inf));
     EXPECT_FP_EQ(nan, func(neg_inf, nan));
     EXPECT_FP_EQ(nan, func(nan, zero));
@@ -66,12 +65,15 @@ public:
     constexpr StorageType STEP = STORAGE_MAX / COUNT;
     for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
          ++i, v += STEP, w -= STEP) {
-      T x = FPBits(v).get_val(), y = FPBits(w).get_val();
-      if (isnan(x) || isinf(x))
+      FPBits xbits(v), ybits(w);
+      if (xbits.is_inf_or_nan())
         continue;
-      if (isnan(y) || isinf(y))
+      if (ybits.is_inf_or_nan())
         continue;
 
+      T x = xbits.get_val();
+      T y = ybits.get_val();
+
       if (x > y) {
         EXPECT_FP_EQ(x - y, func(x, y));
       } else {
@@ -80,3 +82,12 @@ public:
     }
   }
 };
+
+#define LIST_FDIM_TESTS(T, func)                                               \
+  using LlvmLibcFDimTest = FDimTestTemplate<T>;                                \
+  TEST_F(LlvmLibcFDimTest, NaNArg) { test_nan_arg(&func); }                    \
+  TEST_F(LlvmLibcFDimTest, InfArg) { test_inf_arg(&func); }                    \
+  TEST_F(LlvmLibcFDimTest, NegInfArg) { test_neg_inf_arg(&func); }             \
+  TEST_F(LlvmLibcFDimTest, BothZero) { test_both_zero(&func); }                \
+  TEST_F(LlvmLibcFDimTest, InFloatRange) { test_in_range(&func); }             \
+  static_assert(true, "Require semicolon.")
diff --git a/libc/test/src/math/smoke/fdim_test.cpp b/libc/test/src/math/smoke/fdim_test.cpp
index 2f00a30..e1c150d 100644
--- a/libc/test/src/math/smoke/fdim_test.cpp
+++ b/libc/test/src/math/smoke/fdim_test.cpp
@@ -8,26 +8,6 @@
 
 #include "FDimTest.h"
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdim.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
 
-using LlvmLibcFDimTest = FDimTestTemplate<double>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdim) { test_na_n_arg(&LIBC_NAMESPACE::fdim); }
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdim) { test_inf_arg(&LIBC_NAMESPACE::fdim); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdim) {
-  test_neg_inf_arg(&LIBC_NAMESPACE::fdim);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdim) {
-  test_both_zero(&LIBC_NAMESPACE::fdim);
-}
-
-TEST_F(LlvmLibcFDimTest, InDoubleRange_fdim) {
-  test_in_range(&LIBC_NAMESPACE::fdim);
-}
+LIST_FDIM_TESTS(double, LIBC_NAMESPACE::fdim);
diff --git a/libc/test/src/math/smoke/fdimf128_test.cpp b/libc/test/src/math/smoke/fdimf128_test.cpp
new file mode 100644
index 0000000..8e65c2b
--- /dev/null
+++ b/libc/test/src/math/smoke/fdimf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fdimf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FDimTest.h"
+
+#include "src/math/fdimf128.h"
+
+LIST_FDIM_TESTS(float128, LIBC_NAMESPACE::fdimf128);
diff --git a/libc/test/src/math/smoke/fdimf_test.cpp b/libc/test/src/math/smoke/fdimf_test.cpp
index 27511ba..9c27c1d 100644
--- a/libc/test/src/math/smoke/fdimf_test.cpp
+++ b/libc/test/src/math/smoke/fdimf_test.cpp
@@ -8,28 +8,6 @@
 
 #include "FDimTest.h"
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdimf.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
 
-using LlvmLibcFDimTest = FDimTestTemplate<float>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdimf) {
-  test_na_n_arg(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdimf) { test_inf_arg(&LIBC_NAMESPACE::fdimf); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdimf) {
-  test_neg_inf_arg(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdimf) {
-  test_both_zero(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, InFloatRange_fdimf) {
-  test_in_range(&LIBC_NAMESPACE::fdimf);
-}
+LIST_FDIM_TESTS(float, LIBC_NAMESPACE::fdimf);
diff --git a/libc/test/src/math/smoke/fdiml_test.cpp b/libc/test/src/math/smoke/fdiml_test.cpp
index 45aedb0..ed448a6 100644
--- a/libc/test/src/math/smoke/fdiml_test.cpp
+++ b/libc/test/src/math/smoke/fdiml_test.cpp
@@ -8,28 +8,6 @@
 
 #include "FDimTest.h"
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdiml.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
 
-using LlvmLibcFDimTest = FDimTestTemplate<long double>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdiml) {
-  test_na_n_arg(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdiml) { test_inf_arg(&LIBC_NAMESPACE::fdiml); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdiml) {
-  test_neg_inf_arg(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdiml) {
-  test_both_zero(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, InLongDoubleRange_fdiml) {
-  test_in_range(&LIBC_NAMESPACE::fdiml);
-}
+LIST_FDIM_TESTS(long double, LIBC_NAMESPACE::fdiml);
-- 
cgit v1.1


From 088773b0d1c1ee99d78f0b68bf50029637fbead7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= 
Date: Fri, 9 Feb 2024 17:22:40 +0100
Subject: [clang][Interp] Specify triple in C test

This is what test/Sema/const-eval.c does as well; without specifying it,
some Windows builders are broken:

https://lab.llvm.org/buildbot/#/builders/265/builds/2453
---
 clang/test/AST/Interp/c.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index bb2c7cf..afbc518 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 %s
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected,all -std=c11 %s
-// RUN: %clang_cc1 -verify=ref,all -std=c11 %s
-// RUN: %clang_cc1 -pedantic -verify=pedantic-ref,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -verify=ref,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -pedantic -verify=pedantic-ref,all -std=c11 %s
 
 typedef __INTPTR_TYPE__ intptr_t;
 typedef __PTRDIFF_TYPE__ ptrdiff_t;
-- 
cgit v1.1


From e973ab150a802a9503ca34753589d29863df30cc Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Fri, 9 Feb 2024 11:23:39 -0500
Subject: [libc][NFC] Fix few warnings in tests. (#81262)

```
/usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/src/__support/FPUtil/fpbits_test.cpp:268:2: warning: extra ';' outside of a function is incompatible with C++98 [-Wc++98-compat-extra-semi]
};
^
1 warning generated.
```

```
In file included from /usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/src/sys/socket/linux/bind_test.cpp:17:
/usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/UnitTest/Test.h:17:9: warning: 'libc_make_test_file_path' macro redefined [-Wmacro-redefined]
#define libc_make_test_file_path(file_name) (file_name)
        ^
/usr/local/google/home/lntue/experiment/llvm/llvm-project/libc/test/UnitTest/LibcTest.h:20:9: note: previous definition is here
#define libc_make_test_file_path(file_name)                                    \
        ^
1 warning generated.
```
---
 libc/test/src/__support/FPUtil/fpbits_test.cpp | 2 +-
 libc/test/src/sys/socket/linux/bind_test.cpp   | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp
index 4504a4f..b1c4b66 100644
--- a/libc/test/src/__support/FPUtil/fpbits_test.cpp
+++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp
@@ -265,7 +265,7 @@ template <typename T> constexpr auto make(Sign sign, FP fp) {
   case FP::QUIET_NAN:
     return T::quiet_nan(sign);
   }
-};
+}
 
 // Tests all properties for all types of float.
 TYPED_TEST(LlvmLibcFPBitsTest, Properties, FPTypes) {
diff --git a/libc/test/src/sys/socket/linux/bind_test.cpp b/libc/test/src/sys/socket/linux/bind_test.cpp
index 305e4889..e70cbd5 100644
--- a/libc/test/src/sys/socket/linux/bind_test.cpp
+++ b/libc/test/src/sys/socket/linux/bind_test.cpp
@@ -13,7 +13,6 @@
 #include "src/unistd/close.h"
 
 #include "src/errno/libc_errno.h"
-#include "test/UnitTest/LibcTest.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/socket.h> // For AF_UNIX and SOCK_DGRAM
-- 
cgit v1.1


From 4f423e4989056316f9d807abb92c14b188490e30 Mon Sep 17 00:00:00 2001
From: Mark de Wever 
Date: Fri, 9 Feb 2024 17:26:16 +0100
Subject: [libc++][test] Adds backdeployment shorthands. (#78204)

Some changes in libc++ affect the dylib. These changes are not present
on systems that use the system dylib. Currently these are the Apple
backdeployment targets.

Figuring out which macOS versions to target is not trivial for
non-Apple engineers. These shorthands make it easier to select the
proper feature to make a test UNSUPPORTED or XFAIL.

During the design discussion with Louis we considered whether or not to
add preprocessor definitions to allow partial disabling of a test. This
would be useful when an existing feature is changed by modifying the
dylib. In the end we decided not to add this feature to avoid additional
complexity in the tests. Instead the test will be disabled for that
target.
---
 libcxx/utils/libcxx/test/features.py | 101 ++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 6 deletions(-)

diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index ae719a1..a9fb64a 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -526,12 +526,94 @@ DEFAULT_FEATURES += [
 # target that doesn't support it will fail at compile time, not at runtime. This can
 # be achieved by creating a `.verify.cpp` test that checks for the right errors, and
 # mark that test as requiring `stdlib=-libc++ && target=`.
+#
+# Since it is not always known which deployment target to pick, there are
+# short-hands based on the LLVM version like using-built-library-before-llvm-xx.
+# These short-hands make it easy for libc++ developers to select the proper
+# version the feature will be available in and allow vendors to set the proper
+# target information.
 DEFAULT_FEATURES += [
+    # Backdeployment short-hands
+    Feature(
+        name="using-built-library-before-llvm-11",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0)(.0)?}}",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-12",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-11 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx12.{{(0|1|2)}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-13",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-12 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx{{((12.(3|4|5|6|7))|(13.(0|1|2|3)))}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-14",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-13",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-15",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-14 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx13.{{(4|5|6)}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-16",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-15 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx14.{{(0|1|2|3)}}.0)",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-17",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "using-built-library-before-llvm-16",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-18",
+        when=lambda cfg: BooleanExpression.evaluate(
+            # For now, no released version of macOS contains LLVM 18
+            # TODO(ldionne) Please provide the correct value.
+            "using-built-library-before-llvm-17 || stdlib=apple-libc++ && target={{.+}}-apple-macosx{{.+}}",
+            cfg.available_features,
+        ),
+    ),
+
+    Feature(
+        name="using-built-library-before-llvm-19",
+        when=lambda cfg: BooleanExpression.evaluate(
+            # For now, no released version of macOS contains LLVM 19
+            # TODO(ldionne) Please provide the correct value.
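+            # These short-hands are consumed by lit directives in tests; for
+            # instance, a test could be marked with
+            #   UNSUPPORTED: using-built-library-before-llvm-19
+            # and, because each short-hand ORs in its predecessor, the test is
+            # then disabled on every older system dylib as well.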
+ "using-built-library-before-llvm-18 || stdlib=apple-libc++ && target={{.+}}-apple-macosx{{.+}}", + cfg.available_features, + ), + ), + # Tests that require std::to_chars(floating-point) in the built library Feature( name="availability-fp_to_chars-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}", + "using-built-library-before-llvm-13", cfg.available_features, ), ), @@ -539,7 +621,7 @@ DEFAULT_FEATURES += [ Feature( name="availability-char8_t_support-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0)(.0)?}}", + "using-built-library-before-llvm-11", cfg.available_features, ), ), @@ -547,7 +629,7 @@ DEFAULT_FEATURES += [ Feature( name="availability-verbose_abort-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}", + "using-built-library-before-llvm-13", cfg.available_features, ), ), @@ -555,7 +637,7 @@ DEFAULT_FEATURES += [ Feature( name="availability-pmr-missing", when=lambda cfg: BooleanExpression.evaluate( - "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}", + "using-built-library-before-llvm-13", cfg.available_features, ), ), @@ -579,8 +661,15 @@ DEFAULT_FEATURES += [ Feature( name="availability-tzdb-missing", when=lambda cfg: BooleanExpression.evaluate( - # TODO(ldionne) Please provide the correct value. - "(stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}})", + "using-built-library-before-llvm-19", + cfg.available_features, + ), + ), + # Tests that require support for and std::print in in the built library. + Feature( + name="availability-print-missing", + when=lambda cfg: BooleanExpression.evaluate( + "using-built-library-before-llvm-18", cfg.available_features, ), ), -- cgit v1.1 From a5cc1dc82d61c156f75edc72eccacdb6776bf3f1 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:29:02 +0100 Subject: [NFC][libc++] Removes obsolete compiler support. (#80481) These work-arounds were slated for removal in LLVM-18, but missed the deadline. --- .../header_exportable_declarations.cpp | 32 ---------------------- 1 file changed, 32 deletions(-) diff --git a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp index 5157a45..0a48f85 100644 --- a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp +++ b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp @@ -69,50 +69,18 @@ header_exportable_declarations::header_exportable_declarations( } std::optional list = Options.get("SkipDeclarations"); - // TODO(LLVM-17) Remove clang 15 work-around. 
-#if defined(__clang_major__) && __clang_major__ < 16 - if (list) { - std::string_view s = *list; - auto b = s.begin(); - auto e = std::find(b, s.end(), ' '); - while (b != e) { - skip_decls_.emplace(b, e); - if (e == s.end()) - break; - b = e + 1; - e = std::find(b, s.end(), ' '); - } - } -#else // defined(__clang_major__) && __clang_major__ < 16 if (list) for (auto decl : std::views::split(*list, ' ')) { std::string s; std::ranges::copy(decl, std::back_inserter(s)); // use range based constructor skip_decls_.emplace(std::move(s)); } -#endif // defined(__clang_major__) && __clang_major__ < 16 decls_ = skip_decls_; list = Options.get("ExtraDeclarations"); - // TODO(LLVM-17) Remove clang 15 work-around. -#if defined(__clang_major__) && __clang_major__ < 16 - if (list) { - std::string_view s = *list; - auto b = s.begin(); - auto e = std::find(b, s.end(), ' '); - while (b != e) { - std::cout << "using ::" << std::string_view{b, e} << ";\n"; - if (e == s.end()) - break; - b = e + 1; - e = std::find(b, s.end(), ' '); - } - } -#else // defined(__clang_major__) && __clang_major__ < 16 if (list) for (auto decl : std::views::split(*list, ' ')) std::cout << "using ::" << std::string_view{decl.data(), decl.size()} << ";\n"; -#endif // defined(__clang_major__) && __clang_major__ < 16 } header_exportable_declarations::~header_exportable_declarations() { -- cgit v1.1 From 7291761669dd63624ccaab30887aca7e9c7d3273 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:30:30 +0100 Subject: [libc++] Fixes charconv operator bool tests. (#80598) This was spotted by @philnik. --- .../charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp | 4 ++-- .../charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp index b628a2c..a6aa590 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp @@ -28,13 +28,13 @@ constexpr bool test() { { std::from_chars_result value{nullptr, std::errc{}}; assert(bool(value) == true); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } // False { std::from_chars_result value{nullptr, std::errc::value_too_large}; assert(bool(value) == false); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } return true; diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp index ef9364d..621eb8a 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp @@ -28,13 +28,13 @@ constexpr bool test() { { std::to_chars_result value{nullptr, std::errc{}}; assert(bool(value) == true); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } // False { std::to_chars_result value{nullptr, std::errc::value_too_large}; assert(bool(value) == false); - static_assert(noexcept(bool(true)) == true); + static_assert(noexcept(bool(value)) == true); } return true; -- cgit v1.1 From 
b33b91a21788d439f49d6db4e7224c20f740f1a7 Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" 
Date: Fri, 9 Feb 2024 17:35:14 +0100
Subject: [mlir] update transform dialect tutorials (#81199)

Use the "main" transform-interpreter pass instead of the test pass.
This, along with the previously introduced debug extension, now allows
tutorials to no longer depend on test passes and extensions.
---
 mlir/docs/Tutorials/transform/Ch1.md               | 347 +++++++++++----------
 mlir/docs/Tutorials/transform/Ch2.md               | 202 ++++++------
 mlir/docs/Tutorials/transform/Ch3.md               |  12 +-
 mlir/docs/Tutorials/transform/Ch4.md               |   2 +-
 .../transform/Ch2/transform-opt/transform-opt.cpp  |  22 +-
 .../transform/Ch3/transform-opt/transform-opt.cpp  |  26 +-
 .../transform/Ch4/transform-opt/transform-opt.cpp  |  12 -
 .../mlir/Dialect/Transform/Transforms/Passes.td    |   4 +
 .../Transforms/TransformInterpreterUtils.h         |   5 +
 .../mlir/Dialect/Transform/Utils/RaggedArray.h     |   3 +
 .../Transform/Transforms/InterpreterPass.cpp       |  24 +-
 .../Transforms/TransformInterpreterUtils.cpp       |  36 ++-
 .../Examples/transform/Ch1/invalidation-1.mlir     |  75 ++---
 .../Examples/transform/Ch1/invalidation-2.mlir     |  18 +-
 mlir/test/Examples/transform/Ch1/sequence.mlir     | 105 ++++---
 mlir/test/Examples/transform/Ch2/invalid.mlir      |  10 +-
 mlir/test/Examples/transform/Ch2/ops.mlir          |  15 +-
 mlir/test/Examples/transform/Ch2/sequence.mlir     |  99 +++---
 mlir/test/Examples/transform/Ch3/invalid.mlir      |  10 +-
 mlir/test/Examples/transform/Ch3/ops.mlir          |  28 +-
 mlir/test/Examples/transform/Ch3/sequence.mlir     | 113 +++----
 mlir/test/Examples/transform/ChH/full.mlir         |   6 +-
 22 files changed, 615 insertions(+), 559 deletions(-)

diff --git a/mlir/docs/Tutorials/transform/Ch1.md b/mlir/docs/Tutorials/transform/Ch1.md
index 7a299a4..b0fdf08 100644
--- a/mlir/docs/Tutorials/transform/Ch1.md
+++ b/mlir/docs/Tutorials/transform/Ch1.md
@@ -6,7 +6,7 @@ The Transform dialect allows one to precisely target transformations at specific
 
 Transform IR operations operate on values that may be associated with payload IR operations, values or attributes. We call the first two kinds of values operation and value handles, respectively. We call the last kind of values parameters.
 
-The application of transform IR always starts from one top-level operation. In the C++ API, this operation is passed to the `applyTransforms` function. This top-level operation specifies if other transformations should be performed and how. The most common top-level operation merely applies other transform operations listed in its body one after the other.
+The application of transform IR always starts from one top-level operation. In the C++ API, this operation is passed to the `applyTransforms` function. This top-level operation specifies if other transformations should be performed and how. The most common top-level operation, `transform.named_sequence`, merely applies other transform operations listed in its body one after the other, similarly to a function or a macro.
 
 Let us illustrate this with a simple sequence of transformations on the common “fully connected + bias + ReLU” ML layer, which boils down to performing a matrix multiplication, followed by an (elementwise) matrix addition and taking an elementwise maximum with 0. This can be expressed using the following IR:
This can be expressed using the following IR: @@ -14,7 +14,7 @@ Let us illustrate this with a simple sequence of transformations on the common func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) -> tensor<512x512xf32> { - // Matrix-matrix multiplication. + // Matrix-matrix multiplication. %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> @@ -22,7 +22,7 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> - + // Elementwise max with 0 (ReLU). %c0f = arith.constant 0.0 : f32 %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } @@ -37,30 +37,34 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, For performance reasons, we would like to tile and fuse these operations to exploit cache locality. This is a sequence of transformations that need to be performed one after another, so we naturally start with the corresponding top-level transform operation. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + transform.yield + } } ``` There are several aspects worth noticing in this operation. -The first entry block argument is mandatory for top-level transform operations and is associated with the top-level payload operation that sequence is applied to, for example, a module or a function. This operation is specified when calling `applyTransforms`. +Its special name, `@__transform_main` and the first argument are mandated by the interpreter pass, similarly to how the entry point of C programs needs to be called `main` and may have the `int (int argc, char** argv)` signature. This argument will be associated with the top-level payload operation, most often the operation that the pass is applied to. Note that none of this is required when applying the transformation _programmatically_ via `applyTransforms` or `applyNamedSequence`. The remaining entry block arguments are optional and can be associated with payload attributes, operations or values that are useful in the sequence. These are also specified when calling `applyTransforms`. In our case, we are interested in the matrix multiplication and elementwise operations that we are going to tile and fuse. All value handles have Transform dialect types. These types specify certain properties of the payload IR entities associated with them. In this example, `transform.any_op` indicates that the handle is associated with arbitrary payload operations. On the contrary, `transform.op<"X">` indicates that the handle is associated _only_ with payload operations of kind `X`. These constraints are verified when the handle/payload association is created. For entry block arguments of top-level transform operations, this happens early in the `applyTransforms` function. If the constraints are not satisfied, the transform application fails and produces diagnostics for the user. 
+Finally, the operation is wrapped in a module with the `transform.with_named_sequence` attribute that triggers all necessary verifications if multiple named sequences exist. + ## Failure Propagation -Speaking about diagnostics, the `sequence` operation itself has a mandatory attribute specifying the failure propagation mode. There are two options: +The Transform dialect infrastructure has a particular mechanism for handling diagnostics that supports recoverable errors. It is best understood by considering the (unnamed) sequence operation that has a mandatory attribute specifying the failure propagation mode. There are two options: * “propagate” makes the sequence transformation fail if any of the nested transformation fails; * “suppress” makes the sequence succeed even if one of the nested transformations fails, but without attempting to perform the transformations following the failed one in the sequence. -This latter allows the transformation to continue despite (recoverable) errors. As we are only building the transformation, it is preferable to propagate failures so we know when something did not apply. +This latter allows the transformation script surrounding the sequence to continue despite errors within the sequence, assuming they are recoverable. As we are only building the transformation script, it is preferable to propagate failures so we know when something did not apply. To check or debug a transform sequence, it is possible to print various entities associated with the transform IR values. For example, we can print the operations associated with the handles: @@ -83,27 +87,26 @@ Since we don’t want to recompile the compiler every time we change a transform ```sh -$ mlir-opt matmul.mlir --pass-pipeline=" - builtin.module(test-transform-dialect-interpreter{ - bind-first-extra-to-ops=linalg.matmul - bind-second-extra-to-ops=linalg.elemwise_binary})" +$ mlir-opt sequence.mlir --pass-pipeline=" + builtin.module(transform-interpreter{ + debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary})" ``` -The `matmul.mlir` file contains _both_ the payload IR function _and_ the transform IR sequence nested in the same module. The transform interpreter will find the first top-level transform operation in the root operation of the pass (the module in our case) and apply it to that root operation. In our case, we also asked the interpreter pass to associate the two extra arguments of the top-level sequence with all `linalg.matmul` and `linalg.elemwise_binary` payload operations through the respective pass options. Running this pass results in the expected remarks: +The `sequence.mlir` file contains _both_ the payload IR function _and_ the transform IR sequence nested in the same module. The transform interpreter pass will apply the `@__transform_main` named sequence to the anchor operation of the pass. In our case, we also asked the interpreter pass to associate the two extra arguments of the top-level sequence with all `linalg.matmul` and `linalg.elemwise_binary` payload operations through the respective pass options. 
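+For the programmatic route, the interpreter essentially boils down to a call to `applyTransforms`. The sketch below is illustrative rather than normative: the local variable names are invented, and the authoritative signature is the one declared in the Transform dialect interface headers.
+
+```cpp
+// Hedged sketch: apply the named sequence `transformEntryPoint` to
+// `payloadRoot`, binding the trailing sequence arguments to `extraMappings`.
+mlir::transform::TransformOptions options; // expensive checks are on by default
+if (failed(mlir::transform::applyTransforms(payloadRoot, transformEntryPoint,
+                                            extraMappings, options)))
+  return failure();
+```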
Running this pass results in the expected remarks: ```sh -matmul.mlir:7:13: remark: matmul +sequence.mlir:7:13: remark: matmul %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) ^ -matmul.mlir:7:13: note: see current operation: %0 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> -matmul.mlir:10:13: remark: elemwise_binaries +sequence.mlir:7:13: note: see current operation: %0 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> +sequence.mlir:10:13: remark: elemwise_binaries %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } ^ -matmul.mlir:10:13: note: see current operation: %1 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> -matmul.mlir:14:13: remark: elemwise_binaries +sequence.mlir:10:13: note: see current operation: %1 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> +sequence.mlir:14:13: remark: elemwise_binaries %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } ^ -matmul.mlir:14:13: note: see current operation: %2 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%1, %cst : tensor<512x512xf32>, f32) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> +sequence.mlir:14:13: note: see current operation: %2 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%1, %cst : tensor<512x512xf32>, f32) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> ``` Note that `%arg2` is associated with both elementwise payload operations. Any handle is associated with a list of entities. Individual transformations may or may not care about the order of elements in that list. @@ -114,26 +117,33 @@ Note that `%arg2` is associated with both elementwise payload operations. Any ha Now that we have handles to the operations we want to transform, we are ready to apply the transformations. Let us first try tiling the matmul operation itself. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // The actual tiling transformation takes tile sizes as attributes. - %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // The actual tiling transformation takes tile sizes as attributes. 
+ %loop, %tiled = transform.structured.tile_using_forall %arg1 + tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) + -> (!transform.any_op, !transform.any_op) + transform.yield + } } ``` -The transformation returns two handles, as indicated in its [documentation](https://mlir.llvm.org/docs/Dialects/Transform/#transformstructuredtile_using_forall-transformtiletoforallop): +The transformation returns two handles, as indicated in its [documentation](https://mlir.llvm.org/docs/Dialects/Transform/#transformstructuredtile_using_forall-transformtileusingforallop): -* A handle to the `scf.forall` “multi-for” loop around tensors. * A handle to `linalg.generic` operating on the subset of the original data. +* A handle to the `scf.forall` “multi-for” loop around tensors. Running this transformation with the same command as above expectedly produces the tiled code. ```mlir -func.func @fc_relu(%arg0: tensor<512x512xf32>, %arg1: tensor<512x512xf32>, %arg2: tensor<512x512xf32>, %arg3: tensor<512x512xf32>) -> tensor<512x512xf32> { +func.func @fc_relu(%arg0: tensor<512x512xf32>, + %arg1: tensor<512x512xf32>, + %arg2: tensor<512x512xf32>, + %arg3: tensor<512x512xf32>) -> tensor<512x512xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = scf.forall (%arg4, %arg5) in (128, 16) shared_outs(%arg6 = %arg3) -> (tensor<512x512xf32>) { %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) @@ -144,7 +154,7 @@ func.func @fc_relu(%arg0: tensor<512x512xf32>, %arg1: tensor<512x512xf32>, %arg2 : tensor<512x512xf32> to tensor<512x32xf32> %extracted_slice_1 = tensor.extract_slice %arg6[%3, %4] [4, 32] [1, 1] : tensor<512x512xf32> to tensor<4x32xf32> - %5 = linalg.matmul + %5 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%extracted_slice_1 : tensor<4x32xf32>) -> tensor<4x32xf32> @@ -168,78 +178,79 @@ Besides producing new handles, the tiling transform operation _consumes_ the ope ## Handle Invalidation and Expensive Checks Mode -Undefined behavior is difficult to grapple with when it does happen, so the Transform dialect interpreter provides a set of additional expensive checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode. +Undefined behavior is difficult to grapple with when it does happen, so the Transform dialect interpreter defaults to performing a set of additional, potentially expensive, checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // The actual tiling transformation takes tile sizes as attributes. - %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - - // This is trying to use an invalidated handle leading to undefined behavior. 
- transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // The actual tiling transformation takes tile sizes as attributes. + %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + + // This is trying to use an invalidated handle leading to undefined behavior. + transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> + transform.yield + } } ``` However, with the expensive checks enabled in the interpreter, a nice diagnostic is produced: ```sh -$ mlir-opt matmul.mlir --pass-pipeline=" - builtin.module(test-transform-dialect-interpreter{ - bind-first-extra-to-ops=linalg.matmul - bind-second-extra-to-ops=linalg.elemwise_binary - enable-expensive-checks})" -``` - -```sh -matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op +sequence.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op transform.debug.emit_remark_at %mm, "elemwise_binaries" : !transform.any_op ^ -matmul.mlir:26:9: note: handle to invalidated ops +sequence.mlir:26:9: note: handle to invalidated ops %mm = transform.cast %matmul : !transform.op<"linalg.matmul"> to !transform.any_op ^ -matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them +sequence.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them %loop, %tiled = transform.structured.tile_using_forall %mm tile_sizes [4, 32] ``` -One may observe that some operations such as `transform.cast` do not consume the operand (because they don’t erase the corresponding operation). So what would happen if we tried to use that operand instead? +When compile-time performance is a concern, and the transformation sequence is sufficiently stable, it is possible to disable expensive checks in the interpreter for improved performance by providing the `disable-expensive-checks` option to the pass or by setting the corresponding flag in the `TransformOptions` passed into `applyTransforms`. + +One may observe that some operations such as `transform.cast` do not consume the operand (because they don’t erase the corresponding operation). So what would happen if we tried to use that operand instead? ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // We can cast one type to another as long as operations are compatible - // with both types. This creates "aliasing" handles. - %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> - to !transform.any_op - - // The actual tiling transformation takes tile sizes as attributes. 
-  %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
-    : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
-
-  // Consuming an operand invalidates the consumed handle and any other handle that is
-  // associated with the same payload operations, or payload operations nested in them.
-  transform.debug.emit_remark_at %casted, "remark"
-    : !transform.any_op
-  transform.yield
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op,
+      %arg1: !transform.op<"linalg.matmul">,
+      %arg2: !transform.op<"linalg.elemwise_binary">) {
+    // We can cast one type to another as long as operations are compatible
+    // with both types. This creates "aliasing" handles.
+    %casted = transform.cast %arg1 : !transform.op<"linalg.matmul">
+        to !transform.any_op
+
+    // The actual tiling transformation takes tile sizes as attributes.
+    %loop, %tiled = transform.structured.tile_using_forall %arg1
+                    tile_sizes [4, 32]
+      : (!transform.op<"linalg.matmul">)
+      -> (!transform.any_op, !transform.any_op)
+
+    // Consuming an operand invalidates the consumed handle and any other handle
+    // that is associated with the same payload operations, or payload
+    // operations nested in them.
+    transform.debug.emit_remark_at %casted, "remark"
+        : !transform.any_op
+    transform.yield
+  }
 }
 ```
 
 Both `%arg1` and `%casted` reference the same payload operation. Extending the reference analogy, these references alias. Naturally, when the payload operation is erased, all references to it become dangling. This is also the case for handles. In fact, consuming an operand invalidates the operand handle as well as any other handle that is associated with any of the same payload operations. The payload IR consideration is recursive: a handle associated with a payload operation _nested_ in the erased one is also invalidated (because erasing the operation also erases its regions and all contained operations). The expensive-checks mode can also handle this case.
 
 ```sh
-matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
+sequence.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
   transform.debug.emit_remark_at %matmul, "elemwise_binaries" : !transform.op<"linalg.matmul">
   ^
-matmul.mlir:21:29: note: handle to invalidated ops
+sequence.mlir:21:29: note: handle to invalidated ops
 ^bb0(%root: !transform.any_op, %matmul: !transform.op<"linalg.matmul">, %elemwise: !transform.op<"linalg.elemwise_binary">):
                             ^
-matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
+sequence.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
   %loop, %tiled = transform.structured.tile_using_forall %mm tile_sizes [4, 32]
 ```
 
@@ -248,39 +259,41 @@ matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
 Going back to the transformation sequence, we have tiled the matrix multiplication, but we also want to tile and fuse the elementwise operations. The typical way of doing this in the structured operations paradigm is to tile the last operation in some acyclic dataflow graph, and then progressively fuse the operations that produce its operands.
This removes the need to explicitly tile all operations as fusion can adapt their sizes and inject recomputation if desired. So instead of tiling the matmul operation, we are going to tile the last operation in the chain, and then fuse the preceding operations into the loops produced by tiling. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 - : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It - // produces a handle to the loop generated during tiling. - %tiled_max, %loop = - transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one by one. This requires the operation that is being fused to - // define the value used within the loop, so the order of such fusions is - // important. We could also use "transform.merge_handles" to obtain a single - // handle to all operations and give it to `fuse_into_containing_op` that - // would take care of the ordering in this case. - %add_fused, %loop_0 = - transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop_1 = - transform.structured.fuse_into_containing_op %arg1 into %loop_0 - : (!transform.op<"linalg.matmul">, !transform.any_op) - -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 + : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) - transform.yield + // The actual tiling transformation takes tile sizes as attributes. It + // produces a handle to the loop generated during tiling. + %tiled_max, %loop = + transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one by one. This requires the operation that is being fused to + // define the value used within the loop, so the order of such fusions is + // important. We could also use "transform.merge_handles" to obtain a single + // handle to all operations and give it to `fuse_into_containing_op` that + // would take care of the ordering in this case. 
+ %add_fused, %loop_0 = + transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop_1 = + transform.structured.fuse_into_containing_op %arg1 into %loop_0 + : (!transform.op<"linalg.matmul">, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + + transform.yield + } } ``` @@ -291,64 +304,68 @@ This achieves the desired tiling and fusion. Finally, let us assume there exists an efficient microkernel, or a hardware instruction expressed as an intrinsic function, for a 4x4 matrix multiplication. For this purpose, we need to tile the fused operation to the desired size, and then outline it. The resulting function call can then be replaced with a call to the microkernel. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 - : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It - // produces a handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one by one. This requires the operation that is being fused to - // define the value used within the loop, so the order of such fusions is - // important. We could also use "transform.merge_handles" to obtain a single - // handle to all operations and give it to `fuse_into_containing_op` that - // would take care of the ordering in this case. - %add_fused, %loop_0 = - transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop_1 = - transform.structured.fuse_into_containing_op %arg1 into %loop_0 - : (!transform.op<"linalg.matmul">, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 + : (!transform.op<"linalg.elemwise_binary">) -> (!transform.any_op, !transform.any_op) - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %tiled_2, %loop_2 = - transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + // The actual tiling transformation takes tile sizes as attributes. It + // produces a handle to the loop generated during tiling. 
+ %tiled, %loop = transform.structured.tile_using_forall %max + tile_sizes [8, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_3 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - // Since outlining is currently only implemented for region-holding operations - such as loops, use tiling to size 1 to materialize the outer loop that is - going to be outlined. - %_, %outline_target = - transform.structured.tile_using_forall %tiled_2 tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul_fused_2 - into %outline_target - : (!transform.any_op, !transform.any_op) - -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) - - transform.yield + // We can now fuse the other operations into the loop. Here, we fuse + // operations one by one. This requires the operation that is being fused to + // define the value used within the loop, so the order of such fusions is + // important. We could also use "transform.merge_handles" to obtain a single + // handle to all operations and give it to `fuse_into_containing_op` that + // would take care of the ordering in this case. + %add_fused, %loop_0 = + transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop_1 = + transform.structured.fuse_into_containing_op %arg1 into %loop_0 + : (!transform.op<"linalg.matmul">, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the + // transform dialect. Otherwise, it is difficult to differentiate "add" and + // "max", both of which have the same kind. + %tiled_2, %loop_2 = + transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_3 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding + // operations such as loops, use tiling to size 1 to materialize the outer + // loop that is going to be outlined.
+ %_, %outline_target = + transform.structured.tile_using_forall %tiled_2 tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %matmul_fused_2 + into %outline_target + : (!transform.any_op, !transform.any_op) + -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target + {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + transform.yield + } } ``` diff --git a/mlir/docs/Tutorials/transform/Ch2.md b/mlir/docs/Tutorials/transform/Ch2.md index ac6d7d42..1aaefd2 100644 --- a/mlir/docs/Tutorials/transform/Ch2.md +++ b/mlir/docs/Tutorials/transform/Ch2.md @@ -10,37 +10,40 @@ The Transform dialect uses the dialect extension mechanism to allow additional o // In MyExtension.cpp. #include "mlir/Dialect/Transform/IR/TransformDialect.h" -// Define a new Transform dialect extension. This uses the CRTP idiom to identify -// extensions. +// Define a new Transform dialect extension. This uses the CRTP idiom to +// identify extensions. class MyExtension : public ::mlir::transform::TransformDialectExtension<MyExtension> { public: // The extension must derive the base constructor. using Base::Base; - // This function initializes the extension, similarly to `initialize` in dialect - // definitions. List individual operations and dependent dialects here. + // This function initializes the extension, similarly to `initialize` in + // dialect definitions. List individual operations and dependent dialects + // here. void init(); }; void MyExtension::init() { - // Similarly to dialects, an extension can declare a dependent dialect. This dialect - // will be loaded along with the extension and, therefore, along with the Transform - // dialect. Only declare as dependent the dialects that contain the attributes or - // types used by transform operations. Do NOT declare as dependent the dialects - // produced during the transformation. + // Similarly to dialects, an extension can declare a dependent dialect. This + // dialect will be loaded along with the extension and, therefore, along with + // the Transform dialect. Only declare as dependent the dialects that contain + // the attributes or types used by transform operations. Do NOT declare as + // dependent the dialects produced during the transformation. + // // declareDependentDialect<MyDialect>(); - // When transformations are applied, they may produce new operations from previously - // unloaded dialects. Typically, a pass would need to declare itself dependent on - // the dialects containing such new operations. To avoid confusion with the dialects - // the extension itself depends on, the Transform dialects differentiates between: + // When transformations are applied, they may produce new operations from + // previously unloaded dialects. Typically, a pass would need to declare + // itself dependent on the dialects containing such new operations. To avoid + // confusion with the dialects the extension itself depends on, the Transform + // dialect differentiates between: // - dependent dialects, which are used by the transform operations, and - // - generated dialects, which contain the entities (attributes, operations, - // types) that may be produced by applying the transformation even when not - // present in the original payload IR.
- // In the following chapter, we will be add operations that generate function calls - // and structured control flow operations, so let's declare the corresponding - // dialects as generated. + // - generated dialects, which contain the entities (attributes, operations, + // types) that may be produced by applying the transformation even when + // not present in the original payload IR. + // In the following chapter, we will be adding operations that generate function + // calls and structured control flow operations, so let's declare the + // corresponding dialects as generated. declareGeneratedDialect<::mlir::scf::SCFDialect>(); declareGeneratedDialect<::mlir::func::FuncDialect>(); @@ -89,7 +92,7 @@ mlir_tablegen(MyExtension.cpp.inc -gen-op-defs) # Add a CMakeTarget we can depend on to ensure the generation happens before the compilation. add_public_tablegen_target(MyExtensionIncGen) -# Don't forget to generate the documentation, this will produce a MyExtension.md under +# Don't forget to generate the documentation, this will produce a MyExtension.md under # Dialects. add_mlir_doc(MyExtension MyExtension Dialects/ -gen-op-doc) ``` @@ -103,7 +106,8 @@ add_mlir_library( # Built from the following source files. MyExtension.cpp - # Make sure ODS declaration and definitions are generated before compiling this. + # Make sure ODS declarations and definitions are generated before compiling + # this. DEPENDS MyExtensionIncGen @@ -136,10 +140,10 @@ This will generate two files, `MyExtension.h.inc` and `MyExtension.cpp.inc`, tha void MyExtension::init() { // … - // Finally, we register the additional transform operations with the dialect. List all - // operations generated from ODS. This call will perform additional checks that the - // operations implement the transform and memory effect interfaces required by the - // dialect interpreter and assert if they do not. + // Finally, we register the additional transform operations with the dialect. + // List all operations generated from ODS. This call will perform additional + // checks that the operations implement the transform and memory effect + // interfaces required by the dialect interpreter and assert if they do not. registerTransformOps< #define GET_OP_LIST #include "MyExtension.cpp.inc" @@ -154,34 +158,36 @@ With this setup, we are now ready to define the new transform operation to rewri ```tablegen // In MyExtension.td. -// Define the new operation. By convention, prefix its name with the name of the dialect -// extension, "my.". The full operation name will be further prefixed with "transform.". +// Define the new operation. By convention, prefix its name with the name of the +// dialect extension, "my.". The full operation name will be further prefixed +// with "transform.". def ChangeCallTargetOp : Op<Transform_Dialect, "my.change_call_target", [DeclareOpInterfaceMethods<TransformOpInterface>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> { - // Provide a brief and a full description. It is recommended that the latter describes - // the effects on the operands and how the operation processes various failure modes. + // Provide a brief and a full description. It is recommended that the latter + // describes the effects on the operands and how the operation processes + // various failure modes. let summary = "Changes the callee of a call operation to the specified one"; let description = [{ - For each `func.call` payload operation associated with the handle, changes its - callee to be the symbol whose name is provided as an attribute to this operation.
+ For each `func.call` payload operation associated with the handle, changes + its callee to be the symbol whose name is provided as an attribute to this operation. - Generates a silenceable failure if the operand is associated with payload operations - that are not `func.call`. - Only reads the operand. + Generates a silenceable failure if the operand is associated with payload operations that are not `func.call`. Only reads the operand. }]; - // The arguments include the handle to the payload operations and the attribute that - // specifies the new callee. The handle must implement TransformHandleTypeInterface. - // We use a string attribute as the symbol may not exist in the transform IR so the - // verification may fail. + // The arguments include the handle to the payload operations and the + // attribute that specifies the new callee. The handle must implement + // TransformHandleTypeInterface. + // We use a string attribute as the symbol may not exist in the transform IR + // so the verification may fail. let arguments = (ins TransformHandleTypeInterface:$call, StrAttr:$new_target); - // The results are empty as the transformation does not produce any new payload. + // The results are empty as the transformation does not produce any new + // payload. let results = (outs); // Provide nice syntax. @@ -224,8 +230,8 @@ must be modified with the provided rewriter. // It can also carry additional user-defined state. ::mlir::transform::TransformState &state) { - // First, we need to obtain the list of payload operations that are associated with - // the operand handle. + // First, we need to obtain the list of payload operations that are associated + // with the operand handle. auto payload = state.getPayloadOps(getCall()); // Then, we iterate over the list of operands and call the actual IR-mutating @@ -280,56 +286,66 @@ void registerMyExtension(::mlir::DialectRegistry ®istry) { After registering the extension, it becomes possible to use our new operation in the Transform dialect interpreter. The upstream testing pass can be used as is. ```mlir -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %loop, %tiled = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. 
- %add_fused = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> !transform.any_op - %matmul_fused = transform.structured.fuse_into_containing_op %arg1 into %loop - : (!transform.op<"linalg.matmul">, !transform.any_op) -> !transform.any_op - - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %loop_2, %tiled_2 = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 - : (!transform.any_op, !transform.any_op) -> !transform.any_op - - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %outline_target, %_ = transform.structured.tile_using_forall %tiled_2 tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target - : (!transform.any_op, !transform.any_op) -> !transform.any_op - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Rewrite the call target. - transform.my.change_call_target %call, "microkernel" : !transform.any_op - - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 + : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It + // produces a handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_using_forall %max + tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to + // `fuse_into_containing_op` that would take care of the ordering in this + // case. + %add_fused = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %matmul_fused = transform.structured.fuse_into_containing_op %arg1 + into %loop + : (!transform.op<"linalg.matmul">, !transform.any_op) + -> !transform.any_op + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the + // transform dialect. 
Otherwise, it is difficult to differentiate "add" and + // "max", both of which have the same kind. + %loop_2, %tiled_2 = transform.structured.tile_using_forall %add_fused + tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused + into %loop_2 + : (!transform.any_op, !transform.any_op) -> !transform.any_op + + // Since outlining is currently only implemented for region-holding + // operations such as loops, use tiling to size 1 to materialize the outer + // loop that is going to be outlined. + %outline_target, %_ = transform.structured.tile_using_forall %tiled_2 tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %func, %call = transform.loop.outline %outline_target + {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.any_op + + transform.yield + } } ``` diff --git a/mlir/docs/Tutorials/transform/Ch3.md b/mlir/docs/Tutorials/transform/Ch3.md index 84251df..fa788d1 100644 --- a/mlir/docs/Tutorials/transform/Ch3.md +++ b/mlir/docs/Tutorials/transform/Ch3.md @@ -79,7 +79,7 @@ def CallOpInterfaceHandle // The type must implement `TransformHandleTypeInterface`. [DeclareTypeInterfaceMethods<TransformHandleTypeInterface>]> { - // The usual components of a type such as description, mnemonic and assembly format + // The usual components of a type such as description, mnemonic and assembly format // should be provided. let summary = "handle to payload operations implementing CallOpInterface"; let mnemonic = "my.call_op_interface"; @@ -87,7 +87,7 @@ } ``` -We will omit the generation of declaration and definitions using Tablegen for brevity as it is identical to the regular case. +We will omit the generation of declarations and definitions using Tablegen for brevity as it is identical to the regular case. To finalize the definition of a transform type, one must implement the interface methods. @@ -109,9 +109,9 @@ mlir::transform::CallOpInterfaceHandleType::checkPayload( if (llvm::isa<CallOpInterface>(op)) continue; - // By convention, these verifiers always emit a silenceable failure since they are + // By convention, these verifiers always emit a silenceable failure since they are // checking a precondition. - DiagnosedSilenceableFailure diag = emitSilenceableError(loc) + DiagnosedSilenceableFailure diag = emitSilenceableError(loc) << "expected the payload operation to implement CallOpInterface"; diag.attachNote(op->getLoc()) << "offending operation"; return diag; @@ -129,8 +129,8 @@ Additional attributes and types need to be registered in the extension, next to // In MyExtension.cpp. void MyExtension::init() { - // … - + // ... + registerTypes< #define GET_TYPEDEF_LIST #include "MyExtensionTypes.cpp.inc" diff --git a/mlir/docs/Tutorials/transform/Ch4.md b/mlir/docs/Tutorials/transform/Ch4.md index 9c9aba1..ad5221c 100644 --- a/mlir/docs/Tutorials/transform/Ch4.md +++ b/mlir/docs/Tutorials/transform/Ch4.md @@ -205,7 +205,7 @@ transform.named_sequence @__transform_main( %root: !transform.any_op {transform.readonly}) { // Collect groups of operations that match the criteria specified in the // named sequence.
- %matmul, %el1, %el2 = transform.collect_matching @match_matmul_elemwise in %root + %matmul, %el1, %el2 = transform.collect_matching @match_matmul_elemwise in %root : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %elemwise = transform.merge_handles %el1, %el2 : !transform.any_op diff --git a/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp index 3a97531..874ad78 100644 --- a/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp +++ b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp @@ -12,6 +12,7 @@ #include "MyExtension.h" +#include "mlir/Dialect/Transform/Transforms/Passes.h" #include "mlir/IR/DialectRegistry.h" #include "mlir/IR/MLIRContext.h" #include "mlir/InitAllDialects.h" @@ -20,14 +21,6 @@ #include "mlir/Transforms/Passes.h" #include <cstdlib> -// Forward declarations of test passes that used in this chapter for -// illustrative purposes. Test passes are not directly exposed for use in -// binaries other than mlir-opt, which is too big to serve as an example. -namespace mlir::test { -void registerTestTransformDialectEraseSchedulePass(); -void registerTestTransformDialectInterpreterPass(); -} // namespace mlir::test - namespace test { void registerTestTransformDialectExtension(mlir::DialectRegistry &); } // namespace test @@ -39,22 +32,15 @@ int main(int argc, char **argv) { mlir::registerAllExtensions(registry); registerMyExtension(registry); + // Register transform interpreter pass. + mlir::transform::registerInterpreterPass(); + // Register a handful of cleanup passes that we can run to make the output IR // look nicer. mlir::registerCanonicalizerPass(); mlir::registerCSEPass(); mlir::registerSymbolDCEPass(); - // Register the test passes. -#ifdef MLIR_INCLUDE_TESTS - mlir::test::registerTestTransformDialectEraseSchedulePass(); - mlir::test::registerTestTransformDialectInterpreterPass(); - test::registerTestTransformDialectExtension(registry); -#else - llvm::errs() << "warning: MLIR built without test passes, interpreter " - "testing will not be available\n"; -#endif // MLIR_INCLUDE_TESTS - // Delegate to the MLIR utility for parsing and pass management. return mlir::MlirOptMain(argc, argv, "transform-opt-ch2", registry) .succeeded() diff --git a/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp index 3c348c6..c9150c6 100644 --- a/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp +++ b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp @@ -12,6 +12,7 @@ #include "MyExtension.h" +#include "mlir/Dialect/Transform/Transforms/Passes.h" #include "mlir/IR/DialectRegistry.h" #include "mlir/IR/MLIRContext.h" #include "mlir/InitAllDialects.h" @@ -20,18 +21,6 @@ #include "mlir/Transforms/Passes.h" #include <cstdlib> -// Forward declarations of test passes that used in this chapter for -// illustrative purposes. Test passes are not directly exposed for use in -// binaries other than mlir-opt, which is too big to serve as an example. -namespace mlir::test { -void registerTestTransformDialectEraseSchedulePass(); -void registerTestTransformDialectInterpreterPass(); -} // namespace mlir::test - -namespace test { -void registerTestTransformDialectExtension(mlir::DialectRegistry &); -} // namespace test - int main(int argc, char **argv) { // Register all "core" dialects and our transform dialect extension.
mlir::DialectRegistry registry; @@ -39,22 +28,15 @@ int main(int argc, char **argv) { mlir::registerAllExtensions(registry); registerMyExtension(registry); + // Register the interpreter pass. + mlir::transform::registerInterpreterPass(); + // Register a handful of cleanup passes that we can run to make the output IR // look nicer. mlir::registerCanonicalizerPass(); mlir::registerCSEPass(); mlir::registerSymbolDCEPass(); - // Register the test passes. -#ifdef MLIR_INCLUDE_TESTS - mlir::test::registerTestTransformDialectEraseSchedulePass(); - mlir::test::registerTestTransformDialectInterpreterPass(); - test::registerTestTransformDialectExtension(registry); -#else - llvm::errs() << "warning: MLIR built without test passes, interpreter " - "testing will not be available\n"; -#endif // MLIR_INCLUDE_TESTS - // Delegate to the MLIR utility for parsing and pass management. return mlir::MlirOptMain(argc, argv, "transform-opt-ch3", registry) .succeeded() diff --git a/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp index 1019066..03c84bd 100644 --- a/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp +++ b/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp @@ -21,10 +21,6 @@ #include "mlir/Transforms/Passes.h" #include <cstdlib> -namespace test { -void registerTestTransformDialectExtension(mlir::DialectRegistry &); -} // namespace test - int main(int argc, char **argv) { // Register all "core" dialects and our transform dialect extension. mlir::DialectRegistry registry; @@ -39,14 +35,6 @@ int main(int argc, char **argv) { mlir::registerSymbolDCEPass(); mlir::transform::registerInterpreterPass(); - // Register the test passes. -#ifdef MLIR_INCLUDE_TESTS - test::registerTestTransformDialectExtension(registry); -#else - llvm::errs() << "warning: MLIR built without test extension, interpreter " - "testing will not be available\n"; -#endif // MLIR_INCLUDE_TESTS - // Delegate to the MLIR utility for parsing and pass management. return mlir::MlirOptMain(argc, argv, "transform-opt-ch4", registry) .succeeded() diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td index c3436fd..1d6eb24 100644 --- a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td @@ -75,6 +75,10 @@ def InterpreterPass : Pass<"transform-interpreter"> { "Select the operation with 'transform.target_tag' attribute having " "the given value as payload IR root.
If empty select the pass " "anchor operation as the payload IR root.">, + ListOption<"debugBindTrailingArgs", "debug-bind-trailing-args", + "std::string", + "Binds trailing arguments of the entry point to the payload " + "operations with specified names.">, Option<"disableExpensiveChecks", "disable-expensive-checks", "bool", "false", "Disable expensive checks in the interpreter for a faster run.">, diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h index 1737d72..738e0c5 100644 --- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h +++ b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h @@ -84,6 +84,11 @@ LogicalResult applyTransformNamedSequence(Operation *payload, ModuleOp transformModule, const TransformOptions &options); +LogicalResult applyTransformNamedSequence(RaggedArray<MappedValue> bindings, + TransformOpInterface transformRoot, + ModuleOp transformModule, + const TransformOptions &options); + } // namespace transform } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h b/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h index 0ee2391..3d4083b 100644 --- a/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h +++ b/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h @@ -150,6 +150,9 @@ public: slices.resize(slices.size() + num, std::pair(-1, 0)); } + /// Removes the first subarray in-place. Invalidates iterators to all rows. + void removeFront() { slices.erase(slices.begin()); } + private: /// Appends the given elements to the storage and returns an ArrayRef /// pointing to them in the storage. diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp index c875519..5073234 100644 --- a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/Dialect/Transform/Transforms/Passes.h" #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" @@ -64,6 +65,20 @@ public: transform::detail::getPreloadedTransformModule(context); Operation *payloadRoot = findPayloadRoot(getOperation(), debugPayloadRootTag); + if (!payloadRoot) + return signalPassFailure(); + auto debugBindNames = llvm::map_to_vector( + debugBindTrailingArgs, + [&](const std::string &name) { return OperationName(name, context); }); + SmallVector<SmallVector<Operation *>, 2> trailingBindings; + trailingBindings.resize(debugBindNames.size()); + payloadRoot->walk([&](Operation *payload) { + for (auto &&[position, name] : llvm::enumerate(debugBindNames)) { + if (payload->getName() == name) + trailingBindings[position].push_back(payload); + } + }); + Operation *transformEntryPoint = transform::detail::findTransformEntryPoint( getOperation(), transformModule, entryPoint); if (!transformEntryPoint) { @@ -73,8 +88,15 @@ public: return signalPassFailure(); } + RaggedArray<transform::MappedValue> bindings; + bindings.push_back(ArrayRef<Operation *>{payloadRoot}); + for (SmallVector<Operation *> &trailing : trailingBindings) + bindings.push_back(std::move(trailing)); + if (failed(transform::applyTransformNamedSequence( - payloadRoot, transformEntryPoint, transformModule, + bindings, + cast<transform::TransformOpInterface>(transformEntryPoint), + transformModule,
options.enableExpensiveChecks(!disableExpensiveChecks)))) { return signalPassFailure(); } diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp index 2f74b76..8a9cd7c 100644 --- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp @@ -191,22 +191,46 @@ LogicalResult transform::detail::assembleTransformLibraryFromPaths( LogicalResult transform::applyTransformNamedSequence( Operation *payload, Operation *transformRoot, ModuleOp transformModule, const TransformOptions &options) { + RaggedArray<MappedValue> bindings; + bindings.push_back(ArrayRef<Operation *>{payload}); + return applyTransformNamedSequence(bindings, + cast<TransformOpInterface>(transformRoot), + transformModule, options); +} + +LogicalResult transform::applyTransformNamedSequence( + RaggedArray<MappedValue> bindings, TransformOpInterface transformRoot, + ModuleOp transformModule, const TransformOptions &options) { + if (bindings.empty()) { + return transformRoot.emitError() + << "expected at least one binding for the root"; + } + if (bindings.at(0).size() != 1) { + return transformRoot.emitError() + << "expected one payload to be bound to the first argument, got " + << bindings.at(0).size(); + } + auto *payloadRoot = bindings.at(0).front().dyn_cast<Operation *>(); + if (!payloadRoot) { + return transformRoot->emitError() << "expected the object bound to the " + "first argument to be an operation"; + } + + bindings.removeFront(); + // `transformModule` may not be modified. if (transformModule && !transformModule->isAncestor(transformRoot)) { OwningOpRef<ModuleOp> clonedTransformModule(transformModule->clone()); if (failed(detail::mergeSymbolsInto( SymbolTable::getNearestSymbolTable(transformRoot), std::move(clonedTransformModule)))) { - return payload->emitError() << "failed to merge symbols"; + return payloadRoot->emitError() << "failed to merge symbols"; } } LLVM_DEBUG(DBGS() << "Apply\n" << *transformRoot << "\n"); - LLVM_DEBUG(DBGS() << "To\n" << *payload << "\n"); + LLVM_DEBUG(DBGS() << "To\n" << *payloadRoot << "\n"); - // Apply the transform to the IR, do not enforce top-level constraints.
- RaggedArray<MappedValue> noExtraMappings; - return applyTransforms(payload, cast<TransformOpInterface>(transformRoot), - noExtraMappings, options, + return applyTransforms(payloadRoot, transformRoot, bindings, options, /*enforceToplevelTransformOp=*/false); } diff --git a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir index 69b10ae..2264ade 100644 --- a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir +++ b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir @@ -1,8 +1,7 @@ // RUN: mlir-opt %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" \ // RUN: --split-input-file --verify-diagnostics // ****************************** IMPORTANT NOTE ****************************** // // If you are changing this file, you may also need to change // // **************************************************************************** -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - // expected-note @below {{handle to invalidated ops}} - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // The actual tiling transformation takes tile sizes as attributes. - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + // expected-note @below {{handle to invalidated ops}} + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // The actual tiling transformation takes tile sizes as attributes. + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - // This is trying to use an invalidated handle leading to undefined behavior. - // expected-error @below {{uses a handle invalidated by a previously executed transform op}} - transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> - transform.yield + // This is trying to use an invalidated handle leading to undefined behavior. + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul"> + transform.yield + } } // Original function to optimize. @@ -52,27 +53,29 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // We can cast one type to another as long as operations are compatible - // with both types. This creates "aliasing" handles.
- // expected-note @below {{handle to invalidated ops}} - %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> to - !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // We can cast one type to another as long as operations are compatible + // with both types. This creates "aliasing" handles. + // expected-note @below {{handle to invalidated ops}} + %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> to + !transform.any_op - // The actual tiling transformation takes tile sizes as attributes. - // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} - %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] - : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + // The actual tiling transformation takes tile sizes as attributes. + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) - // Consuming an operand invalidates the consumed handle and any other handle that is - // associated with the same payload operations, or payload operations nested in them. - // expected-error @below {{uses a handle invalidated by a previously executed transform op}} - transform.debug.emit_remark_at %casted, "remark" - : !transform.any_op - transform.yield + // Consuming an operand invalidates the consumed handle and any other handle that is + // associated with the same payload operations, or payload operations nested in them. + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.debug.emit_remark_at %casted, "remark" + : !transform.any_op + transform.yield + } } // Original function to optimize. 
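The tests above drive the interpreter through `transform-interpreter{debug-bind-trailing-args=...}`. For reference, below is a minimal sketch of the equivalent programmatic path, built on the `RaggedArray<MappedValue>` overload of `applyTransformNamedSequence` added by this patch; the helper name `runWithExtraBindings` and the hard-coded operation names are illustrative assumptions, not part of the patch.

```cpp
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
#include "mlir/Dialect/Transform/Utils/RaggedArray.h"

using namespace mlir;

// Hypothetical helper: binds all linalg.matmul and linalg.elemwise_binary
// payload operations to the trailing arguments of the named sequence,
// mirroring debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary.
static LogicalResult
runWithExtraBindings(Operation *payloadRoot,
                     transform::TransformOpInterface entryPoint,
                     ModuleOp transformModule) {
  SmallVector<Operation *> matmuls, elemwises;
  payloadRoot->walk([&](Operation *op) {
    if (op->getName().getStringRef() == "linalg.matmul")
      matmuls.push_back(op);
    else if (op->getName().getStringRef() == "linalg.elemwise_binary")
      elemwises.push_back(op);
  });

  // Row 0 must contain exactly the payload root; each subsequent row is
  // bound to one trailing argument of the entry point.
  RaggedArray<transform::MappedValue> bindings;
  bindings.push_back(ArrayRef<Operation *>{payloadRoot});
  bindings.push_back(matmuls);
  bindings.push_back(elemwises);

  transform::TransformOptions options;
  // Expensive checks report uses of invalidated handles as diagnostics
  // instead of undefined behavior, as exercised by the tests in this patch.
  options.enableExpensiveChecks(true);
  return transform::applyTransformNamedSequence(bindings, entryPoint,
                                                transformModule, options);
}
```

When the compile time of running the schedule matters more than safety, the same sketch would pass `options.enableExpensiveChecks(false)` instead, matching the `disable-expensive-checks` pass option.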
diff --git a/mlir/test/Examples/transform/Ch1/invalidation-2.mlir b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir index c4a2f1e..0a84a5c 100644 --- a/mlir/test/Examples/transform/Ch1/invalidation-2.mlir +++ b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir @@ -1,10 +1,8 @@ // RUN: mlir-opt %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" \ // RUN: --split-input-file --verify-diagnostics - // ****************************** IMPORTANT NOTE ****************************** // // If you are changing this file, you may also need to change @@ -45,10 +43,11 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { // Since the %arg2 handle is associated with both elementwise operations, // we need to split it into two handles so we can target only the second // elementwise operation. @@ -99,4 +98,5 @@ transform.sequence failures(propagate) { transform.debug.emit_remark_at %f, "fused" : !transform.any_op transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch1/sequence.mlir b/mlir/test/Examples/transform/Ch1/sequence.mlir index 5de6e6e..3107adc 100644 --- a/mlir/test/Examples/transform/Ch1/sequence.mlir +++ b/mlir/test/Examples/transform/Ch1/sequence.mlir @@ -1,8 +1,7 @@ // RUN: mlir-opt %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" |\ // RUN: FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -60,52 +59,54 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. 
Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. - %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 - : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_second_2 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_second - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) - - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. 
+ %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which have the same kind. + %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch2/invalid.mlir b/mlir/test/Examples/transform/Ch2/invalid.mlir index ad53683..cb67389 100644 --- a/mlir/test/Examples/transform/Ch2/invalid.mlir +++ b/mlir/test/Examples/transform/Ch2/invalid.mlir @@ -1,11 +1,11 @@ -// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics +// RUN: transform-opt-ch2 %s --transform-interpreter --split-input-file \ +// RUN: --verify-diagnostics // expected-note @below {{offending payload}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-error @below {{only applies to func.call payloads}} transform.my.change_call_target %arg0, "updated" : !transform.any_op - yield + transform.yield } } diff --git a/mlir/test/Examples/transform/Ch2/ops.mlir b/mlir/test/Examples/transform/Ch2/ops.mlir index d66f89b..410a6e3 100644 --- a/mlir/test/Examples/transform/Ch2/ops.mlir +++ b/mlir/test/Examples/transform/Ch2/ops.mlir @@ -1,4 +1,4 @@ -// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter | FileCheck %s +// RUN: transform-opt-ch2 %s --transform-interpreter | FileCheck %s // ****************************** IMPORTANT NOTE ****************************** // // If you are changing this file, you may also need to change // // **************************************************************************** @@ -17,10 +17,11 @@ func.func @test() { return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.any_op - transform.my.change_call_target %call, "updated" : !transform.any_op -
transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.any_op + transform.my.change_call_target %call, "updated" : !transform.any_op + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch2/sequence.mlir b/mlir/test/Examples/transform/Ch2/sequence.mlir index b6f32dc..976df1d 100644 --- a/mlir/test/Examples/transform/Ch2/sequence.mlir +++ b/mlir/test/Examples/transform/Ch2/sequence.mlir @@ -1,8 +1,7 @@ // RUN: transform-opt-ch2 %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" |\ // RUN: FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -56,55 +55,57 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // We can now fuse the other operations into the loop. Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. 
- %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 - : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. - %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_second_2 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_second - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. 
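+    // (For reference: the `transform.loop.outline` op used at the end of
+    // this sequence moves the targeted loop into a new function with the
+    // given name and replaces it with a `func.call`; its two results are
+    // handles to the outlined function and to that call.)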
+ %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Rewrite the call target. - transform.my.change_call_target %call, "microkernel" : !transform.any_op + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.any_op - transform.yield + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch3/invalid.mlir b/mlir/test/Examples/transform/Ch3/invalid.mlir index 2226295..acaabd5 100644 --- a/mlir/test/Examples/transform/Ch3/invalid.mlir +++ b/mlir/test/Examples/transform/Ch3/invalid.mlir @@ -1,10 +1,10 @@ -// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics +// RUN: transform-opt-ch3 %s --transform-interpreter --split-input-file --verify-diagnostics // expected-note @below {{offending operation}} -module { - transform.sequence failures(suppress) { +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( // expected-error @below {{expected the payload operation to implement CallOpInterface}} - ^bb0(%arg0: !transform.my.call_op_interface): - yield + %arg0: !transform.my.call_op_interface) { + transform.yield } } diff --git a/mlir/test/Examples/transform/Ch3/ops.mlir b/mlir/test/Examples/transform/Ch3/ops.mlir index f4170b8..b2d47cc 100644 --- a/mlir/test/Examples/transform/Ch3/ops.mlir +++ b/mlir/test/Examples/transform/Ch3/ops.mlir @@ -1,4 +1,4 @@ -// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter \ +// RUN: transform-opt-ch3 %s --transform-interpreter \ // RUN: --allow-unregistered-dialect --split-input-file | FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -18,12 +18,13 @@ func.func @test1() { return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call"> - // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.op<"func.call"> - transform.my.change_call_target %call, "updated" : !transform.op<"func.call"> - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call"> + // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.op<"func.call"> + transform.my.change_call_target %call, "updated" : !transform.op<"func.call"> + transform.yield + } } // ----- @@ -37,10 +38,11 @@ func.func @test2() { return } -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface - // CHECK: transform.my.call_to_op %{{.*}} : (!transform.my.call_op_interface) -> !transform.any_op - transform.my.call_to_op %call : (!transform.my.call_op_interface) -> !transform.any_op - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence 
@__transform_main(%arg0: !transform.any_op) { + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface + // CHECK: transform.my.call_to_op %{{.*}} : (!transform.my.call_op_interface) -> !transform.any_op + transform.my.call_to_op %call : (!transform.my.call_op_interface) -> !transform.any_op + transform.yield + } } diff --git a/mlir/test/Examples/transform/Ch3/sequence.mlir b/mlir/test/Examples/transform/Ch3/sequence.mlir index 9dd46b3..8dc33c3 100644 --- a/mlir/test/Examples/transform/Ch3/sequence.mlir +++ b/mlir/test/Examples/transform/Ch3/sequence.mlir @@ -1,8 +1,7 @@ -// RUN: transform-opt-ch2 %s \ -// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ -// RUN: bind-first-extra-to-ops=linalg.matmul \ -// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ -// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: transform-opt-ch3 %s \ +// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \ +// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\ +// RUN: canonicalize,cse,symbol-dce)" |\ // RUN: FileCheck %s // ****************************** IMPORTANT NOTE ****************************** @@ -56,55 +55,57 @@ func.func private @microkernel( %init: tensor<4x4xf32>, %output: tensor<4x4xf32>) -> tensor<4x4xf32> -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, - %arg1: !transform.op<"linalg.matmul">, - %arg2: !transform.op<"linalg.elemwise_binary">): - // Since the %arg2 handle is associated with both elementwise operations, - // we need to split it into two handles so we can target only the second - // elementwise operation. - %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) - -> (!transform.any_op, !transform.any_op) - - // The actual tiling transformation takes tile sizes as attributes. It produces a - // handle to the loop generated during tiling. - %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // We can now fuse the other operations into the loop. Here, we fuse - // operations one-by-one. This requires the operation that is being fused - // to define the value used within the loop, so the order of such fusions - // is important. We could also use "transform.merge_handles" to obtain - // a single handle to all operations and give it to `fuse_into_containing_op` - // that would take care of the ordering in this case. - %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 - : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Tile again to get the desired size. Note that this time this tiles the - // "add" operation and fuses matmul into the loop, but doesn't affect the - // "max" operation. This illustrates the precise targeting with the transform - // dialect. Otherwise, it is difficult to differentiate "add" and "max", both - // of which having the same kind. 
- %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2, %loop_second_2 = - transform.structured.fuse_into_containing_op %matmul_fused into %loop_second - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Since outlining is currently only implemented for region-holding operations - // such as loops, use tiling to size 1 to materialize the outer loop that is - // going to be outlined. - %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} - : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) - - // Rewrite the call target. - transform.my.change_call_target %call, "microkernel" : !transform.op<"func.call"> - - transform.yield +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">) { + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. 
+ %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.op<"func.call"> + + transform.yield + } } diff --git a/mlir/test/Examples/transform/ChH/full.mlir b/mlir/test/Examples/transform/ChH/full.mlir index d90d740..f8d9103 100644 --- a/mlir/test/Examples/transform/ChH/full.mlir +++ b/mlir/test/Examples/transform/ChH/full.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --test-transform-dialect-interpreter \ +// RUN: mlir-opt %s --transform-interpreter \ // RUN: --test-transform-dialect-erase-schedule \ // RUN: --math-uplift-to-fma \ // RUN: --convert-bufferization-to-memref \ @@ -115,9 +115,9 @@ module attributes { transform.with_named_sequence } { // have no effect on the Halide IR as of 294f80c49bf3bb8582446613c25fcce03b82. // Also note that the order of dimensions in Halide is inverted, e.g., co and // n are the outermost loops in the respective reorder directives. - transform.sequence failures(propagate) { + transform.named_sequence @__transform_main( // This argument will point to the top-level module. - ^bb0(%arg0: !transform.any_op): + %arg0: !transform.any_op) { // 1. Find the operations we are going to transform usnig their names. This // is a simplistic approach that works when there are few operations in the -- cgit v1.1 From 6d1396148977ca275df243a965ac504448bf5faa Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:40:08 +0100 Subject: [libc++][test] Improves substitution naming (#80471) Using the `-dir` suffix for directories makes it easier to understand. 
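Concretely, the bridge configuration below now defines the directory
substitutions as, e.g.,

    config.substitutions.append(('%{lib-dir}', '@LIBCXX_LIBRARY_DIR@'))
    config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@'))

so tests spell them %{libcxx-dir}, %{include-dir}, %{target-include-dir},
%{lib-dir}, %{module-dir} and %{test-tools-dir} instead of the old %{libcxx},
%{include}, %{target-include}, %{lib}, %{module} and %{test-tools}. The
expanded values are unchanged, so out-of-tree test configurations only need
the same mechanical rename.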
Fixes: https://github.com/llvm/llvm-project/issues/78310 --- libcxx/test/configs/apple-libc++-backdeployment.cfg.in | 4 ++-- libcxx/test/configs/apple-libc++-shared.cfg.in | 6 +++--- libcxx/test/configs/armv7m-picolibc-libc++.cfg.in | 4 ++-- libcxx/test/configs/cmake-bridge.cfg.in | 12 ++++++------ libcxx/test/configs/ibm-libc++-shared.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-android-ndk.cfg.in | 4 ++-- libcxx/test/configs/llvm-libc++-mingw.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in | 4 ++-- .../llvm-libc++-shared-no-vcruntime-clangcl.cfg.in | 6 +++--- libcxx/test/configs/llvm-libc++-shared.cfg.in | 4 ++-- libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in | 4 ++-- libcxx/test/configs/llvm-libc++-static.cfg.in | 4 ++-- .../assertions/headers_declare_verbose_abort.gen.py | 2 +- libcxx/test/libcxx/clang_modules_include.gen.py | 2 +- libcxx/test/libcxx/clang_tidy.gen.py | 6 +++--- libcxx/test/libcxx/double_include.gen.py | 2 +- libcxx/test/libcxx/header_inclusions.gen.py | 2 +- libcxx/test/libcxx/headers_in_modulemap.sh.py | 2 +- libcxx/test/libcxx/libcpp_version.gen.py | 2 +- libcxx/test/libcxx/module_std.gen.py | 6 +++--- libcxx/test/libcxx/module_std_compat.gen.py | 6 +++--- libcxx/test/libcxx/no_assert_include.gen.py | 2 +- libcxx/test/libcxx/system_reserved_names.gen.py | 2 +- libcxx/test/libcxx/transitive_includes.gen.py | 8 ++++---- .../vendor/apple/system-install-properties.sh.cpp | 18 +++++++++--------- .../libcxx/vendor/clang-cl/static-lib-exports.sh.cpp | 4 ++-- .../test/libcxx/vendor/mingw/static-lib-exports.sh.cpp | 4 ++-- libcxx/utils/libcxx/test/features.py | 2 +- libcxx/utils/libcxx/test/format.py | 4 ++-- 30 files changed, 72 insertions(+), 72 deletions(-) diff --git a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in b/libcxx/test/configs/apple-libc++-backdeployment.cfg.in index b471c02..4259446 100644 --- a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in +++ b/libcxx/test/configs/apple-libc++-backdeployment.cfg.in @@ -45,10 +45,10 @@ config.substitutions.append(('%{flags}', '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T --env DYLD_LIBRARY_PATH="%{cxx-runtime-root}:%{abi-runtime-root}:%{unwind-runtime-root}" -- ' diff --git a/libcxx/test/configs/apple-libc++-shared.cfg.in b/libcxx/test/configs/apple-libc++-shared.cfg.in index af1926e..2d0aee3 100644 --- a/libcxx/test/configs/apple-libc++-shared.cfg.in +++ b/libcxx/test/configs/apple-libc++-shared.cfg.in @@ -13,13 +13,13 @@ config.substitutions.append(('%{flags}', '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib} -- ' + '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib-dir} -- ' )) config.stdlib = 'apple-libc++' 
diff --git a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in index a39d43a..8ca8603 100644 --- a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in +++ b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in @@ -5,7 +5,7 @@ libc_linker_script = '@CMAKE_INSTALL_PREFIX@/lib/picolibcpp.ld' config.substitutions.append(('%{flags}', '--sysroot=@CMAKE_INSTALL_PREFIX@')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' # Disable warnings in cxx_atomic_impl.h: # "large atomic operation may incur significant performance penalty; the @@ -17,7 +17,7 @@ config.substitutions.append(('%{compile_flags}', ' -include picolibc.h' )) config.substitutions.append(('%{link_flags}', - '-nostdlib -nostdlib++ -L %{lib} -lc++ -lc++abi' + '-nostdlib -nostdlib++ -L %{lib-dir} -lc++ -lc++abi' ' -lc -lm -lclang_rt.builtins -lsemihost -lcrt0-semihost' + ' -T {}'.format(libc_linker_script) + ' -Wl,--defsym=__flash=0x0' diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index 72b2ddf..84b3270 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -25,9 +25,9 @@ config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') # Add substitutions for bootstrapping the test suite configuration import shlex config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@'))) -config.substitutions.append(('%{libcxx}', '@LIBCXX_SOURCE_DIR@')) -config.substitutions.append(('%{include}', '@LIBCXX_GENERATED_INCLUDE_DIR@')) -config.substitutions.append(('%{target-include}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@')) -config.substitutions.append(('%{lib}', '@LIBCXX_LIBRARY_DIR@')) -config.substitutions.append(('%{module}', '@LIBCXX_GENERATED_MODULE_DIR@')) -config.substitutions.append(('%{test-tools}', '@LIBCXX_TEST_TOOLS_PATH@')) +config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@')) +config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@')) +config.substitutions.append(('%{target-include-dir}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@')) +config.substitutions.append(('%{lib-dir}', '@LIBCXX_LIBRARY_DIR@')) +config.substitutions.append(('%{module-dir}', '@LIBCXX_GENERATED_MODULE_DIR@')) +config.substitutions.append(('%{test-tools-dir}', '@LIBCXX_TEST_TOOLS_PATH@')) diff --git a/libcxx/test/configs/ibm-libc++-shared.cfg.in b/libcxx/test/configs/ibm-libc++-shared.cfg.in index 50061e9..0f86e74 100644 --- a/libcxx/test/configs/ibm-libc++-shared.cfg.in +++ b/libcxx/test/configs/ibm-libc++-shared.cfg.in @@ -12,13 +12,13 @@ if lit.util.isAIXTriple(config.target_triple): config.substitutions.append(('%{flags}', '-pthread')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -D__LIBC_NO_CPP_MATH_OVERLOADS__ -I %{include} -I %{libcxx}/test/support' + '-nostdinc++ -D__LIBC_NO_CPP_MATH_OVERLOADS__ -I %{include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++ -lc++abi -latomic -Wl,-bbigtoc' + '-nostdlib++ -L %{lib-dir} -lc++ -lc++abi -latomic -Wl,-bbigtoc' )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --env LIBPATH=%{lib} -- ' + '%{executor} --execdir %T --env LIBPATH=%{lib-dir} -- ' )) # LIBCXX-AIX-FIXME is the feature name used to XFAIL the diff --git 
a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in b/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in index 1be8527..d5f1ccc 100644 --- a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in +++ b/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in @@ -16,7 +16,7 @@ config.substitutions.append(('%{flags}', '--sysroot @CMAKE_SYSROOT@' if '@CMAKE_SYSROOT@' else '' )) -compile_flags = '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' +compile_flags = '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' if re.match(r'i686-linux-android(21|22|23)$', config.target_triple): # 32-bit x86 Android has a bug where the stack is sometimes misaligned. # The problem appears limited to versions before Android N (API 24) and only @@ -31,7 +31,7 @@ config.substitutions.append(('%{compile_flags}', compile_flags)) # libc++_shared.so because older Bionic dynamic loaders don't support rpath # lookup. config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++_shared' + '-nostdlib++ -L %{lib-dir} -lc++_shared' )) config.substitutions.append(('%{exec}', '%{executor}' + diff --git a/libcxx/test/configs/llvm-libc++-mingw.cfg.in b/libcxx/test/configs/llvm-libc++-mingw.cfg.in index eb77f11..8a0cc96 100644 --- a/libcxx/test/configs/llvm-libc++-mingw.cfg.in +++ b/libcxx/test/configs/llvm-libc++-mingw.cfg.in @@ -5,13 +5,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --prepend_env PATH=%{lib} -- ' + '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- ' )) import os, site diff --git a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in index 50d28eb..cca88c8 100644 --- a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in +++ b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in @@ -5,13 +5,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '--driver-mode=g++')) config.substitutions.append(('%{compile_flags}', - '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include + '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include )) config.substitutions.append(('%{link_flags}', - '-nostdlib -L %{lib} -lc++ -l' + config.cxx_lib + '-nostdlib -L %{lib-dir} -lc++ -l' + config.cxx_lib )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --prepend_env PATH=%{lib} -- ' + '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- ' )) import os, site diff --git a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in index a75e90b..7d107c8 100644 --- a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in +++ 
b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in @@ -6,10 +6,10 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '-pthread')) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++ -lm' + '-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++ -lm' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in index 4c88af3..a8ad920 100644 --- a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in +++ b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in @@ -6,13 +6,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '--driver-mode=g++')) config.substitutions.append(('%{compile_flags}', - '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX -D_HAS_EXCEPTIONS=0' + config.dbg_include + '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX -D_HAS_EXCEPTIONS=0' + config.dbg_include )) config.substitutions.append(('%{link_flags}', - '-nostdlib -L %{lib} -lc++ -l' + config.cxx_lib + '-nostdlib -L %{lib-dir} -lc++ -l' + config.cxx_lib )) config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --prepend_env PATH=%{lib} -- ' + '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- ' )) import os, site diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in index 143b3b3..5199f64 100644 --- a/libcxx/test/configs/llvm-libc++-shared.cfg.in +++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in @@ -7,10 +7,10 @@ config.substitutions.append(('%{flags}', '-pthread' + (' -isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '') )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++' + '-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in index 4baaad7..7c700bf 100644 --- a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in +++ b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in @@ -5,10 +5,10 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') config.substitutions.append(('%{flags}', '--driver-mode=g++')) config.substitutions.append(('%{compile_flags}', - '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS 
-D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include + '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include )) config.substitutions.append(('%{link_flags}', - '-nostdlib -L %{lib} -llibc++ -l' + config.cxx_lib + '-nostdlib -L %{lib-dir} -llibc++ -l' + config.cxx_lib )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/configs/llvm-libc++-static.cfg.in b/libcxx/test/configs/llvm-libc++-static.cfg.in index e866d4f..097cc4d 100644 --- a/libcxx/test/configs/llvm-libc++-static.cfg.in +++ b/libcxx/test/configs/llvm-libc++-static.cfg.in @@ -7,10 +7,10 @@ config.substitutions.append(('%{flags}', '-pthread' + (' -isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '') )) config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support' + '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support' )) config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++ -lc++abi' + '-nostdlib++ -L %{lib-dir} -lc++ -lc++abi' )) config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- ' diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py index a4e1c3c..bd883aa 100644 --- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py +++ b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py @@ -10,7 +10,7 @@ # is required for users to be able to include any public header and then override # the function using a strong definition. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py index 26ef207..e3593ee 100644 --- a/libcxx/test/libcxx/clang_modules_include.gen.py +++ b/libcxx/test/libcxx/clang_modules_include.gen.py @@ -10,7 +10,7 @@ # This is important notably because the LLDB data formatters use # libc++ headers with modules enabled. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index b2f1a17..19b6a99 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -8,7 +8,7 @@ # Run our custom libc++ clang-tidy checks on all public headers. 
-# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) @@ -27,8 +27,8 @@ for header in public_headers: {lit_header_restrictions.get(header, '')} // TODO: run clang-tidy with modules enabled once they are supported -// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules -// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules +// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules +// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules #include <{header}> """) diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py index 85055df..2fcfa50 100644 --- a/libcxx/test/libcxx/double_include.gen.py +++ b/libcxx/test/libcxx/double_include.gen.py @@ -8,7 +8,7 @@ # Test that we can include each header in two TU's and link them together. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py index cdbc5b3..faaa4cf 100644 --- a/libcxx/test/libcxx/header_inclusions.gen.py +++ b/libcxx/test/libcxx/header_inclusions.gen.py @@ -9,7 +9,7 @@ # Test that all headers include all the other headers they're supposed to, as # prescribed by the Standard. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/headers_in_modulemap.sh.py b/libcxx/test/libcxx/headers_in_modulemap.sh.py index fe007f0..237b006 100644 --- a/libcxx/test/libcxx/headers_in_modulemap.sh.py +++ b/libcxx/test/libcxx/headers_in_modulemap.sh.py @@ -1,4 +1,4 @@ -# RUN: %{python} %s %{libcxx}/utils %{include} +# RUN: %{python} %s %{libcxx-dir}/utils %{include-dir} import sys diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/libcxx/libcpp_version.gen.py index 47439b0..7d9519d 100644 --- a/libcxx/test/libcxx/libcpp_version.gen.py +++ b/libcxx/test/libcxx/libcpp_version.gen.py @@ -8,7 +8,7 @@ # Test that all headers define the _LIBCPP_VERSION macro. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py index a9a05a0..fc23985 100644 --- a/libcxx/test/libcxx/module_std.gen.py +++ b/libcxx/test/libcxx/module_std.gen.py @@ -16,7 +16,7 @@ # to be one monolitic test. Since the test doesn't take very long it's # not a huge issue. 
-# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys @@ -25,9 +25,9 @@ from libcxx.test.modules import module_test_generator generator = module_test_generator( "%t", - "%{module}", + "%{module-dir}", "%{clang-tidy}", - "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", + "%{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", "std", diff --git a/libcxx/test/libcxx/module_std_compat.gen.py b/libcxx/test/libcxx/module_std_compat.gen.py index 270d131..000aa29 100644 --- a/libcxx/test/libcxx/module_std_compat.gen.py +++ b/libcxx/test/libcxx/module_std_compat.gen.py @@ -16,7 +16,7 @@ # to be one monolitic test. Since the test doesn't take very long it's # not a huge issue. -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys @@ -26,9 +26,9 @@ from libcxx.test.modules import module_test_generator generator = module_test_generator( "%t", - "%{module}", + "%{module-dir}", "%{clang-tidy}", - "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", + "%{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", "std.compat", diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/libcxx/no_assert_include.gen.py index a5e733d..dd8006d 100644 --- a/libcxx/test/libcxx/no_assert_include.gen.py +++ b/libcxx/test/libcxx/no_assert_include.gen.py @@ -9,7 +9,7 @@ # Ensure that none of the standard C++ headers implicitly include cassert or # assert.h (because assert() is implemented as a macro). -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 5b75dba..0d935a1 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -10,7 +10,7 @@ # alphabetic macros. Also ensure that we don't swallow the definition of user # provided macros (in other words, ensure that we push/pop correctly everywhere). -# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 43f92d9..28f223c 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -16,7 +16,7 @@ # forever, however we do try to group removals for a couple of releases # to avoid breaking users at every release. 
-# RUN: %{python} %s %{libcxx}/utils +# RUN: %{python} %s %{libcxx-dir}/utils import sys sys.path.append(sys.argv[1]) @@ -48,7 +48,7 @@ if regenerate_expected_results: all_traces.append(f'%t/trace-includes.{normalized_header}.txt') print(f"""\ -// RUN{BLOCKLIT}: %{{python}} %{{libcxx}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv +// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv """) else: @@ -83,8 +83,8 @@ else: // RUN{BLOCKLIT}: mkdir %t // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt -// RUN{BLOCKLIT}: %{{python}} %{{libcxx}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv -// RUN{BLOCKLIT}: cat %{{libcxx}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv +// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv +// RUN{BLOCKLIT}: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv // RUN{BLOCKLIT}: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv #include <{header}> """) diff --git a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp index 6c84e0d..3e2e080 100644 --- a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp +++ b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp @@ -13,17 +13,17 @@ // Make sure we install the libc++ headers in the right location. // -// RUN: stat "%{include}/__config" +// RUN: stat "%{include-dir}/__config" // Make sure we install libc++.1.dylib and libc++experimental.a in the right location. // -// RUN: stat "%{lib}/libc++.1.dylib" -// RUN: stat "%{lib}/libc++experimental.a" +// RUN: stat "%{lib-dir}/libc++.1.dylib" +// RUN: stat "%{lib-dir}/libc++experimental.a" // Make sure we install a symlink from libc++.dylib to libc++.1.dylib. // -// RUN: stat "%{lib}/libc++.dylib" -// RUN: readlink "%{lib}/libc++.dylib" | grep "libc++.1.dylib" +// RUN: stat "%{lib-dir}/libc++.dylib" +// RUN: readlink "%{lib-dir}/libc++.dylib" | grep "libc++.1.dylib" // Make sure the install_name is /usr/lib. // @@ -34,15 +34,15 @@ // // TODO: We currently don't do that correctly in the CMake build. // -// XRUNX: otool -L "%{lib}/libc++.1.dylib" | grep '/usr/lib/libc++.1.dylib' -// XRUNX: ! otool -l "%{lib}/libc++.1.dylib" | grep -E "LC_RPATH|@loader_path|@rpath" +// XRUNX: otool -L "%{lib-dir}/libc++.1.dylib" | grep '/usr/lib/libc++.1.dylib' +// XRUNX: ! otool -l "%{lib-dir}/libc++.1.dylib" | grep -E "LC_RPATH|@loader_path|@rpath" // Make sure the compatibility_version of libc++ is 1.0.0. // Failure to respect this can result in applications not being able to find libc++ // when they are loaded by dyld, if the compatibility version was bumped. // -// RUN: otool -L "%{lib}/libc++.1.dylib" | grep "libc++.1.dylib" | grep "compatibility version 1.0.0" +// RUN: otool -L "%{lib-dir}/libc++.1.dylib" | grep "libc++.1.dylib" | grep "compatibility version 1.0.0" // Make sure we use the libdispatch backend for the PSTL. 
// -// RUN: grep "%{include}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH' +// RUN: grep "%{include-dir}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH' diff --git a/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp b/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp index 447454e..7ed1492 100644 --- a/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp +++ b/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp @@ -11,6 +11,6 @@ // This file checks that the built static libraries don't contain dllexport // directives in clang-cl builds. -// RUN: llvm-readobj --coff-directives "%{lib}/libc++.lib" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++.lib" | not grep -i "export:" > /dev/null -// RUN: llvm-readobj --coff-directives "%{lib}/libc++experimental.lib" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++experimental.lib" | not grep -i "export:" > /dev/null diff --git a/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp b/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp index 8f29f5a..e20269f 100644 --- a/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp +++ b/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp @@ -11,6 +11,6 @@ // This file checks that the built static libraries don't contain dllexport // directives in MinGW builds. -// RUN: llvm-readobj --coff-directives "%{lib}/libc++.a" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++.a" | not grep -i "export:" > /dev/null -// RUN: llvm-readobj --coff-directives "%{lib}/libc++experimental.a" | not grep -i "export:" > /dev/null +// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++experimental.a" | not grep -i "export:" > /dev/null diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index a9fb64a..6ef4075 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -27,7 +27,7 @@ _msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(com def _getSuitableClangTidy(cfg): try: # If we didn't build the libcxx-tidy plugin via CMake, we can't run the clang-tidy tests. - if runScriptExitCode(cfg, ["stat %{test-tools}/clang_tidy_checks/libcxx-tidy.plugin"]) != 0: + if runScriptExitCode(cfg, ["stat %{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin"]) != 0: return None # TODO MODULES require ToT due module specific fixes. diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py index 1317521..229da22 100644 --- a/libcxx/utils/libcxx/test/format.py +++ b/libcxx/utils/libcxx/test/format.py @@ -172,7 +172,7 @@ def parseScript(test, preamble): f"{compileFlags} " "-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal " "-fmodule-file=std=%T/std.pcm " # The std.compat module imports std. 
- "--precompile -o %T/std.compat.pcm -c %{module}/std.compat.cppm", + "--precompile -o %T/std.compat.pcm -c %{module-dir}/std.compat.cppm", ) moduleCompileFlags.extend( ["-fmodule-file=std.compat=%T/std.compat.pcm", "%T/std.compat.pcm"] @@ -188,7 +188,7 @@ def parseScript(test, preamble): "%dbg(MODULE std) %{cxx} %{flags} " f"{compileFlags} " "-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal " - "--precompile -o %T/std.pcm -c %{module}/std.cppm", + "--precompile -o %T/std.pcm -c %{module-dir}/std.cppm", ) moduleCompileFlags.extend(["-fmodule-file=std=%T/std.pcm", "%T/std.pcm"]) -- cgit v1.1 From 4bf9fa5fb50497878edf8e277574ea9fb7d6bb7f Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 9 Feb 2024 17:41:46 +0100 Subject: [libc++][modules] Guard missing header validation on Windows. (#80478) On Windows the libc++ test suite sees the MSVC STL headers and may conclude these are libc++ headers when inspecting the name. Modules guard against forgetting to export new headers. Finding MSVC STL's headers gives false positives. Since the CI tests non-Windows platforms too, the validation will be disabled on Windows. Fixes: https://github.com/llvm/llvm-project/issues/79010 --------- Co-authored-by: Louis Dionne --- libcxx/modules/std.compat.cppm.in | 75 ++++++++++++++++++--------------- libcxx/modules/std.cppm.in | 75 ++++++++++++++++++--------------- libcxx/utils/generate_libcxx_cppm_in.py | 23 +++++++--- 3 files changed, 102 insertions(+), 71 deletions(-) diff --git a/libcxx/modules/std.compat.cppm.in b/libcxx/modules/std.compat.cppm.in index 651d6ec..1636371 100644 --- a/libcxx/modules/std.compat.cppm.in +++ b/libcxx/modules/std.compat.cppm.in @@ -46,39 +46,48 @@ module; #endif // *** Headers not yet available *** -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() 
+// +// This validation is mainly to aid libc++ developers to add modules for new +// headers. On Windows the Windows SDK can be in the include path. This SDK +// contains the MSVC STL headers. This may give false positives when MSVC STL +// provides a header libc++ has not implemented yet. Therefore this validation +// is not done on Windows. +// +#ifndef _WIN32 +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +#endif // _WIN32 export module std.compat; export import std; diff --git a/libcxx/modules/std.cppm.in b/libcxx/modules/std.cppm.in index 6ce8e28..3b59c28 100644 --- a/libcxx/modules/std.cppm.in +++ b/libcxx/modules/std.cppm.in @@ -168,39 +168,48 @@ module; #include // *** Headers not yet available *** -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in 
headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() -#if __has_include() -# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include() +// +// This validation is mainly to aid libc++ developers to add modules for new +// headers. On Windows the Windows SDK can be in the include path. This SDK +// contains the MSVC STL headers. This may give false positives when MSVC STL +// provides a header libc++ has not implemented yet. Therefore this validation +// is not done on Windows. +// +#ifndef _WIN32 +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +# if __has_include() +# error "please update the header information for in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include() +#endif // _WIN32 export module std; diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py index 2d3f829..0390ce5 100644 --- a/libcxx/utils/generate_libcxx_cppm_in.py +++ b/libcxx/utils/generate_libcxx_cppm_in.py @@ -57,18 +57,31 @@ module; else: module_cpp_in.write(f"#include <{header}>\n") - module_cpp_in.write("\n// *** Headers not yet available ***\n") + module_cpp_in.write( + """ +// *** Headers not yet available *** +// +// This validation is mainly to catch when a new header is added but adding the +// corresponding .inc file is forgotten. 
However, the check based on __has_include +// alone doesn't work on Windows because the Windows SDK is on the include path, +// and that means the MSVC STL headers can be found as well, tricking __has_include +// into thinking that libc++ provides the header. +// +#ifndef _WIN32 +""" + ) for header in sorted(headers_not_available): module_cpp_in.write( f"""\ -#if __has_include(<{header}>) -# error "please update the header information for <{header}> in headers_not_available in utils/libcxx/header_information.py" -#endif // __has_include(<{header}>) +# if __has_include(<{header}>) +# error "please update the header information for <{header}> in headers_not_available in utils/libcxx/header_information.py" +# endif // __has_include(<{header}>) """ ) module_cpp_in.write( - f""" + f"""#endif // _WIN32 + export module {module}; {'export import std;' if module == 'std.compat' else ''} -- cgit v1.1 From a7520d9727d2638047e5c464b2937581f64e2ce5 Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Fri, 9 Feb 2024 22:14:04 +0530 Subject: [Clang-tidy] bugprone-too-small-loop-variable - false-negative when const variable is used as loop bound (#81183) Changed LibASTMatcher to give an appropriate warning when a const loop bound is initialized with a function declaration. Fixes: #79580 --- .../clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp | 12 ++++++++---- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../clang-tidy/checks/bugprone/too-small-loop-variable.rst | 4 ++++ .../clang-tidy/checkers/bugprone/too-small-loop-variable.cpp | 12 ++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp index 8ba8b89..a73d46f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp @@ -82,10 +82,14 @@ void TooSmallLoopVariableCheck::registerMatchers(MatchFinder *Finder) { // We are interested in only those cases when the loop bound is a variable // value (not const, enum, etc.). StatementMatcher LoopBoundMatcher = - expr(ignoringParenImpCasts(allOf(hasType(isInteger()), - unless(integerLiteral()), - unless(hasType(isConstQualified())), - unless(hasType(enumType()))))) + expr(ignoringParenImpCasts(allOf( + hasType(isInteger()), unless(integerLiteral()), + unless(allOf( + hasType(isConstQualified()), + declRefExpr(to(varDecl(anyOf( + hasInitializer(ignoringParenImpCasts(integerLiteral())), + isConstexpr(), isConstinit())))))), + unless(hasType(enumType()))))) .bind(LoopUpperBoundName); // We use the loop increment expression only to make sure we found the right diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index e50914a..dff8dd2 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -117,6 +117,10 @@ Changes in existing checks options `HeaderFileExtensions` and `ImplementationFileExtensions` by the global options of the same name. +- Improved :doc:`bugprone-too-small-loop-variable + ` support by correctly + implementing the check for const loop boundary. 
+
 - Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer
   ` by removing enforcement of rule `C.48
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
index 0f45cc2..2c3ded9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
@@ -28,6 +28,10 @@ In a real use case size means a container's size which depends on the user input
 This algorithm works for a small amount of objects, but will lead to freeze for
 a larger user input.
 
+It's recommended to enable the compiler warning
+`-Wtautological-constant-out-of-range-compare` as well, since the check does not
+inspect compile-time constant loop boundaries to avoid overlaps with the warning.
+
 .. option:: MagnitudeBitsUpperLimit
 
    Upper limit for the magnitude bits of the loop variable. If it's set the check
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
index 3229deb..113150b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
@@ -93,6 +93,18 @@ void voidBadForLoopWithMacroBound() {
   }
 }
 
+unsigned int getVal() {
+  return 300;
+}
+
+// The iteration's upper bound has a function declaration.
+void voidBadForLoop8() {
+  const unsigned int l = getVal();
+  for (unsigned char i = 0; i < l; ++i) {
+    // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: loop variable has narrower type 'unsigned char' than iteration's upper bound 'const unsigned int' [bugprone-too-small-loop-variable]
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Correct loops: we should not warn here.
-- cgit v1.1


From 5afbed1968588fe443a8a55053c2f1eaa321d28e Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 9 Feb 2024 08:48:49 -0800
Subject: [llvm-objcopy] Fix the build after
 7ddc32052546abd41656d2e670f3902b1bf805a7. NFCI

---
 llvm/lib/ObjCopy/ELF/ELFObject.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
index c2de456..d7559ab 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -2787,7 +2787,7 @@ IHexWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const {
   IHexSectionWriterBase LengthCalc(EmptyBuffer);
   for (const SectionBase *Sec : Sections)
     if (Error Err = Sec->accept(LengthCalc))
-      return Err;
+      return std::move(Err);
 
   // We need space to write section records + StartAddress record
   // (if start address is not zero) + EndOfFile record.
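A note on why this one-line change is needed, since the message above only says it
fixes the build: getTotalSize returns Expected<size_t>, while Err is a named
llvm::Error, a move-only type. Returning a named object whose type differs from the
function's return type goes through a converting constructor, and before C++20's
extended implicit-move rules that constructor sees Err as an lvalue and therefore
tries Error's deleted copy constructor. A minimal sketch of the pattern (mayFail and
computeSize are hypothetical names for illustration, not the actual llvm-objcopy
code):

#include "llvm/Support/Error.h"
#include <cstddef>

llvm::Error mayFail(); // hypothetical helper, declared for illustration only

llvm::Expected<size_t> computeSize() {
  if (llvm::Error Err = mayFail()) {
    // return Err;         // rejected by pre-C++20 compilers: overload
    //                     // resolution picks Error's deleted copy constructor
    return std::move(Err); // fine: the Error binds as an rvalue
  }
  return 42;
}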
-- cgit v1.1 From 1245f5f4da8f88d031c0a69388d97e8a6d7f00b5 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Fri, 9 Feb 2024 08:52:06 -0800 Subject: [clang][Driver] Add support for XROS_DEPLOYMENT_TARGET env var (#81011) --- clang/lib/Driver/ToolChains/Darwin.cpp | 6 ++++-- clang/lib/Driver/ToolChains/Darwin.h | 2 +- clang/test/Driver/xros-driver-requires-darwin-host.c | 13 +++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/xros-driver-requires-darwin-host.c diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index fae8ad1..cc1219d 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1902,6 +1902,7 @@ getDeploymentTargetFromEnvironmentVariables(const Driver &TheDriver, "TVOS_DEPLOYMENT_TARGET", "WATCHOS_DEPLOYMENT_TARGET", "DRIVERKIT_DEPLOYMENT_TARGET", + "XROS_DEPLOYMENT_TARGET" }; static_assert(std::size(EnvVars) == Darwin::LastDarwinPlatform + 1, "Missing platform"); @@ -1914,14 +1915,15 @@ getDeploymentTargetFromEnvironmentVariables(const Driver &TheDriver, // default platform. if (!Targets[Darwin::MacOS].empty() && (!Targets[Darwin::IPhoneOS].empty() || - !Targets[Darwin::WatchOS].empty() || !Targets[Darwin::TvOS].empty())) { + !Targets[Darwin::WatchOS].empty() || !Targets[Darwin::TvOS].empty() || + !Targets[Darwin::XROS].empty())) { if (Triple.getArch() == llvm::Triple::arm || Triple.getArch() == llvm::Triple::aarch64 || Triple.getArch() == llvm::Triple::thumb) Targets[Darwin::MacOS] = ""; else Targets[Darwin::IPhoneOS] = Targets[Darwin::WatchOS] = - Targets[Darwin::TvOS] = ""; + Targets[Darwin::TvOS] = Targets[Darwin::XROS] = ""; } else { // Don't allow conflicts in any other platform. unsigned FirstTarget = std::size(Targets); diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h index 5e60b08..10d4b69 100644 --- a/clang/lib/Driver/ToolChains/Darwin.h +++ b/clang/lib/Driver/ToolChains/Darwin.h @@ -300,7 +300,7 @@ public: WatchOS, DriverKit, XROS, - LastDarwinPlatform = DriverKit + LastDarwinPlatform = XROS }; enum DarwinEnvironmentKind { NativeEnvironment, diff --git a/clang/test/Driver/xros-driver-requires-darwin-host.c b/clang/test/Driver/xros-driver-requires-darwin-host.c new file mode 100644 index 0000000..e5bfcca --- /dev/null +++ b/clang/test/Driver/xros-driver-requires-darwin-host.c @@ -0,0 +1,13 @@ +// REQUIRES: system-darwin + +// RUN: env XROS_DEPLOYMENT_TARGET=1.0 %clang -arch arm64 -c -### %s 2>&1 | FileCheck %s + +// RUN: rm -rf %t.dir +// RUN: mkdir -p %t.dir/XROS1.0.sdk +// RUN: %clang -arch arm64 -isysroot %t.dir/XROS1.0.sdk -c -### %s 2>&1 | FileCheck %s +// RUN: mkdir -p %t.dir/XRSimulator1.0.sdk +// RUN: %clang -arch arm64 -isysroot %t.dir/XRSimulator1.0.sdk -c -### %s 2>&1 | FileCheck --check-prefix=CHECK_SIM %s + + +// CHECK: "-cc1"{{.*}} "-triple" "arm64-apple-xros1.0.0" +// CHECK_SIM: "-cc1"{{.*}} "-triple" "arm64-apple-xros1.0.0-simulator" -- cgit v1.1 From 94272a5a5d1549b32818805b82805e42c62ccfb4 Mon Sep 17 00:00:00 2001 From: Daniil Fukalov <1671137+dfukalov@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:54:14 +0100 Subject: [OpenMP] Fix libomp debug build. (#81029) Disable libstdc++ assertions in the runtime library just like in https://reviews.llvm.org/D143168. 
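To sketch the failure mode this avoids (illustrative code, not part of the patch):
when _GLIBCXX_ASSERTIONS is defined, libstdc++ containers compile in extra checks,
and on recent libstdc++ versions the failure path of those checks references an
out-of-line handler that lives in libstdc++.so, which is exactly the dependency
libomp must not pick up.

#include <vector>

// Built with -D_GLIBCXX_ASSERTIONS, the subscript below expands to a checked
// access whose failure path calls into libstdc++'s assertion machinery.
int last(const std::vector<int> &v) { return v[v.size() - 1]; }

Because the preprocessor honors whichever of -D/-U comes last on the command line,
the added -U_GLIBCXX_ASSERTIONS cancels the -D injected by an assertions-enabled
LLVM build for libomp's own sources.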
--- openmp/runtime/src/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index b0ecf12..ff129fe 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -152,6 +152,10 @@ if(UNIX) set(LIBOMP_DL_LIBS ${CMAKE_DL_LIBS}) endif() +# Disable libstdc++ assertions, even in an LLVM_ENABLE_ASSERTIONS build, to +# avoid an unwanted dependency on libstdc++.so. +add_definitions(-U_GLIBCXX_ASSERTIONS) + # Add the OpenMP library libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS) -- cgit v1.1 From c58c6aac7715d720358e317c26b6768940430ce9 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 9 Feb 2024 20:59:02 +0400 Subject: [clang][Sema] Add checks for validity of default ctor's class (#78898) Fixes #10518 Fixes #67914 Fixes #78388 Also addresses the second example in #49103 This patch is based on suggestion from @cor3ntin in https://github.com/llvm/llvm-project/issues/67914#issuecomment-1896011898 --- clang/docs/ReleaseNotes.rst | 4 ++ clang/lib/Sema/SemaDeclCXX.cpp | 7 ++ clang/test/SemaCXX/crash-GH10518.cpp | 22 ++++++ clang/test/SemaCXX/crash-GH49103-2.cpp | 13 ++++ clang/test/SemaCXX/crash-GH67914.cpp | 78 ++++++++++++++++++++++ clang/test/SemaCXX/crash-GH78388.cpp | 17 +++++ .../transform_error.mandates.verify.cpp | 2 +- 7 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaCXX/crash-GH10518.cpp create mode 100644 clang/test/SemaCXX/crash-GH49103-2.cpp create mode 100644 clang/test/SemaCXX/crash-GH67914.cpp create mode 100644 clang/test/SemaCXX/crash-GH78388.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index df3ad20..7631f3b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -182,6 +182,10 @@ Bug Fixes to Attribute Support Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ +- Fix crash when calling the constructor of an invalid class. + Fixes (`#10518 `_), + (`#67914 `_), + and (`#78388 `_) - Fix crash when using lifetimebound attribute in function with trailing return. 
Fixes (`#73619 `_) - Addressed an issue where constraints involving injected class types are perceived diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index fea8c50..ba233c9 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -5998,6 +5998,10 @@ void Sema::ActOnDefaultCtorInitializers(Decl *CDtorDecl) { if (CXXConstructorDecl *Constructor = dyn_cast(CDtorDecl)) { + if (CXXRecordDecl *ClassDecl = Constructor->getParent(); + !ClassDecl || ClassDecl->isInvalidDecl()) { + return; + } SetCtorInitializers(Constructor, /*AnyErrors=*/false); DiagnoseUninitializedFields(*this, Constructor); } @@ -14038,6 +14042,9 @@ void Sema::DefineImplicitDefaultConstructor(SourceLocation CurrentLocation, CXXRecordDecl *ClassDecl = Constructor->getParent(); assert(ClassDecl && "DefineImplicitDefaultConstructor - invalid constructor"); + if (ClassDecl->isInvalidDecl()) { + return; + } SynthesizedFunctionScope Scope(*this, Constructor); diff --git a/clang/test/SemaCXX/crash-GH10518.cpp b/clang/test/SemaCXX/crash-GH10518.cpp new file mode 100644 index 0000000..6c5f80a --- /dev/null +++ b/clang/test/SemaCXX/crash-GH10518.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/10518 + +template +class A : public T { +}; + +template +class B : public A { +}; + +template +class B : public A { // expected-error 0-1 {{}} + B(T *t) {} +}; diff --git a/clang/test/SemaCXX/crash-GH49103-2.cpp b/clang/test/SemaCXX/crash-GH49103-2.cpp new file mode 100644 index 0000000..4c17a05 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH49103-2.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/49103 + +template struct A; // expected-note 0+ {{}} +struct S : __make_integer_seq { }; // expected-error 0+ {{}} +S s; diff --git a/clang/test/SemaCXX/crash-GH67914.cpp b/clang/test/SemaCXX/crash-GH67914.cpp new file mode 100644 index 0000000..fbaeac6 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH67914.cpp @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/67914 + +template < typename, int > +struct Mask; + +template < int, class > +struct conditional { + using type = Mask< int, 16 >; // expected-warning 0+ {{}} +}; + +template < class _Then > +struct conditional< 0, _Then > { + using type = _Then; // expected-warning 0+ {{}} +}; + +template < int _Bp, class, class _Then > +using conditional_t = typename conditional< _Bp, _Then >::type; // expected-warning 0+ {{}} + +template < typename, int > +struct Array; + +template < typename, int, bool, typename > +struct StaticArrayImpl; + +template < typename Value_, int Size_ > +struct Mask : StaticArrayImpl< Value_, Size_, 1, 
Mask< Value_, Size_ > > { // expected-note 0+ {{}} + template < typename T1 > + Mask(T1) {} // expected-note 0+ {{}} +}; + +template < typename T > +void load(typename T::MaskType mask) { + T::load_(mask); // expected-note 0+ {{}} +} + +template < typename Value_, int IsMask_, typename Derived_ > +struct StaticArrayImpl< Value_, 32, IsMask_, Derived_ > { + using Array1 = conditional_t< IsMask_, void, Array< Value_, 16 > >; // expected-warning 0+ {{}} + + template < typename Mask > + static Derived_ load_(Mask mask) { + return Derived_{load< Array1 >(mask.a1), Mask{}}; // expected-error 0+ {{}} + } + + Array1 a1; +}; + +template < typename Derived_ > +struct KMaskBase; + +template < typename Derived_ > +struct StaticArrayImpl< float, 16, 0, Derived_ > { + template < typename Mask > + static Derived_ load_(Mask mask); +}; + +template < typename Derived_ > +struct StaticArrayImpl< float, 16, 1, Mask< float, 16 > > : KMaskBase< Derived_ > {}; // expected-error 0+ {{}} + +template < typename Derived_ > +struct StaticArrayImpl< int, 16, 1, Derived_ > {}; + +template < typename Value_, int Size_ > +struct Array : StaticArrayImpl< Value_, Size_, 0, Array< Value_, Size_ > > { + using MaskType = Mask< Value_, Size_ >; // expected-warning 0+ {{}} +}; + +void test11_load_masked() { + load< Array< float, 32 > >{} == 0; // expected-error 0+ {{}} expected-warning 0+ {{}} expected-note 0+ {{}} +} diff --git a/clang/test/SemaCXX/crash-GH78388.cpp b/clang/test/SemaCXX/crash-GH78388.cpp new file mode 100644 index 0000000..cdec4d5 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH78388.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s +// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s +// RUN: %clang_cc1 -verify -std=c++17 %s +// RUN: %clang_cc1 -verify -std=c++20 %s +// RUN: %clang_cc1 -verify -std=c++23 %s +// RUN: %clang_cc1 -verify -std=c++2c %s + +// https://github.com/llvm/llvm-project/issues/78388 + +typedef mbstate_t; // expected-error 0+ {{}} expected-note 0+ {{}} + template < typename , typename , typename > + class a // expected-error 0+ {{}} + class b { // expected-error 0+ {{}} + namespace { // expected-note 0+ {{}} expected-note 0+ {{}} + template < typename c > b::operator=() { // expected-error 0+ {{}} expected-note 0+ {{}} + struct :a< c, char, stdmbstate_t > d // expected-error 0+ {{}} expected-warning 0+ {{}} diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp index 4f4f5839..508b01a 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp @@ -56,7 +56,7 @@ void test() { e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} - // expected-error-re@*:* {{call to deleted constructor of {{.*}}}} + // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}} // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // 
expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} -- cgit v1.1 From 9bb54b2aa006e3bf5df5eb8672075dd589fb9ba5 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Fri, 9 Feb 2024 09:01:05 -0800 Subject: Move the new test added in 2095655f to its own file ... and set an explicit target triple. Should fix buildbot issues like: https://lab.llvm.org/buildbot/#/builders/245/builds/20379/steps/5/logs/FAIL__Clang__warn-unused-filescoped_cpp --- clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp | 18 ++++++++++++++++++ clang/test/SemaCXX/warn-unused-filescoped.cpp | 16 ---------------- 2 files changed, 18 insertions(+), 16 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp diff --git a/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp b/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp new file mode 100644 index 0000000..8c21da5 --- /dev/null +++ b/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify -Wunused -std=c++98 %s +// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify -Wunused -std=c++14 %s + +__attribute__((target_version("fp16"))) +static int not_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int not_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int not_used_fmv(void) { return 0; } // expected-warning {{unused function 'not_used_fmv'}} + + +__attribute__((target_version("fp16"))) +static int definitely_used_fmv(void) { return 1; } +__attribute__((target_version("fp16fml"))) +static int definitely_used_fmv(void) { return 2; } +__attribute__((target_version("default"))) +static int definitely_used_fmv(void) { return 0; } +int definite_user(void) { return definitely_used_fmv(); } diff --git a/clang/test/SemaCXX/warn-unused-filescoped.cpp b/clang/test/SemaCXX/warn-unused-filescoped.cpp index 0c347e9..be8d350 100644 --- a/clang/test/SemaCXX/warn-unused-filescoped.cpp +++ b/clang/test/SemaCXX/warn-unused-filescoped.cpp @@ -236,20 +236,4 @@ constexpr int constexpr4() { return 2; } #endif } -__attribute__((target_version("fp16"))) -static int not_used_fmv(void) { return 1; } -__attribute__((target_version("fp16fml"))) -static int not_used_fmv(void) { return 2; } -__attribute__((target_version("default"))) -static int not_used_fmv(void) { return 0; } // expected-warning {{unused function 'not_used_fmv'}} - - -__attribute__((target_version("fp16"))) -static int definitely_used_fmv(void) { return 1; } -__attribute__((target_version("fp16fml"))) -static int definitely_used_fmv(void) { return 2; } -__attribute__((target_version("default"))) -static int definitely_used_fmv(void) { return 0; } -int definite_user(void) { return definitely_used_fmv(); } - #endif -- cgit v1.1 From 9dd8ba4429fc22063e6ce18017e7bdbd7552a927 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 9 Feb 2024 17:05:22 +0000 Subject: [InstCombine] Add memcpy test with !tbaa.struct with multiple fields. Add an additional test with a struct with multiple fields. 
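For context, the operands of a !tbaa.struct node are read as (offset, size,
TBAA tag) triples, one triple per field that the memory transfer copies. The new
node !6 added below, !{i64 0, i64 4, !2, i64 4, i64 4, !2}, therefore describes two
adjacent 4-byte fields both accessed through the "float" tag, roughly this source
layout (the struct name is invented for illustration):

struct Pair {
  float a; // offset 0, size 4
  float b; // offset 4, size 4
};

The other new node, !7, deliberately describes fields whose offsets and sizes do
not line up with the copied range, so the tests can observe what the transform does
when the metadata and the transfer size disagree.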
--- .../Transforms/InstCombine/struct-assign-tbaa.ll | 44 ++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll index 5c2ea39..1042c41 100644 --- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll +++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll @@ -35,17 +35,55 @@ define ptr @test2() { ret ptr %tmp } +define void @test3_multiple_fields(ptr nocapture %a, ptr nocapture %b) { +; CHECK-LABEL: @test3_multiple_fields( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i64 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 8, i1 false), !tbaa.struct !6 + ret void +} + +define void @test4_multiple_copy_first_field(ptr nocapture %a, ptr nocapture %b) { +; CHECK-LABEL: @test4_multiple_copy_first_field( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 4, i1 false), !tbaa.struct !6 + ret void +} + +define void @test5_multiple_copy_more_than_first_field(ptr nocapture %a, ptr nocapture %b) { +; CHECK-LABEL: @test5_multiple_copy_more_than_first_field( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 4, i1 false), !tbaa.struct !7 + ret void +} + !0 = !{!"Simple C/C++ TBAA"} !1 = !{!"omnipotent char", !0} !2 = !{!5, !5, i64 0} !3 = !{i64 0, i64 4, !2} !4 = !{i64 0, i64 8, null} !5 = !{!"float", !0} +!6 = !{i64 0, i64 4, !2, i64 4, i64 4, !2} +!7 = !{i64 0, i64 2, !2, i64 4, i64 6, !2} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. -; CHECK: [[TBAA0]] = !{!1, !1, i64 0} -; CHECK: [[META1:![0-9]+]] = !{!"float", !2} -; CHECK: [[META2:![0-9]+]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"} ;. -- cgit v1.1 From 0d72f0beabc180754eae334f22f01e48a5032bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 9 Feb 2024 17:13:37 +0000 Subject: [mlir][Vector] Fix "scalability" in CastAwayExtractStridedSliceLeadingOneDim (#81187) Makes sure that "scalability" flags in the `CastAwayExtractStridedSliceLeadingOneDim` pattern are correctly updated. 
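For readers unfamiliar with the notation in the tests below: a vector dimension
written in square brackets, as in vector<1x8x[8]xf16>, is scalable, meaning its
actual size is an unknown runtime multiple of the printed value. The regression was
that the rewritten extract built its result type from shape and element type only,
silently turning scalable dimensions into fixed ones. A condensed sketch of the
shape of the fix (the helper name is invented for illustration):

#include "mlir/IR/BuiltinTypes.h"

// Dropping leading unit dims must also drop the matching scalability flags;
// rebuilding the type without them would reset every dimension to fixed size.
mlir::VectorType dropLeadingDims(mlir::VectorType ty, unsigned n) {
  return mlir::VectorType::get(ty.getShape().drop_front(n),
                               ty.getElementType(),
                               ty.getScalableDims().drop_front(n));
}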
--- .../Vector/Transforms/VectorDropLeadUnitDim.cpp | 3 ++- .../Vector/vector-dropleadunitdim-transforms.mlir | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp index e1ed5d8..74382b0 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -73,7 +73,8 @@ struct CastAwayExtractStridedSliceLeadingOneDim VectorType oldDstType = extractOp.getType(); VectorType newDstType = VectorType::get(oldDstType.getShape().drop_front(dropCount), - oldDstType.getElementType()); + oldDstType.getElementType(), + oldDstType.getScalableDims().drop_front(dropCount)); Location loc = extractOp.getLoc(); diff --git a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir index f601be0..bb2d30f 100644 --- a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir @@ -206,6 +206,16 @@ func.func @cast_away_extract_strided_slice_leading_one_dims(%arg0: vector<1x8x8x return %0: vector<1x1x8xf16> } +// CHECK-LABEL: func @cast_away_extract_strided_slice_leading_one_dims_scalable +func.func @cast_away_extract_strided_slice_leading_one_dims_scalable(%arg0: vector<1x8x[8]xf16>) -> vector<1x1x[8]xf16> { + // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<8x[8]xf16> from vector<1x8x[8]xf16> + // CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[SRC]] {offsets = [4], sizes = [1], strides = [1]} : vector<8x[8]xf16> to vector<1x[8]xf16> + %0 = vector.extract_strided_slice %arg0 {offsets = [0, 4], sizes = [1, 1], strides = [1, 1]} : vector<1x8x[8]xf16> to vector<1x1x[8]xf16> + // CHECK: %[[RET:.+]] = vector.broadcast %[[EXTRACT]] : vector<1x[8]xf16> to vector<1x1x[8]xf16> + // CHECK: return %[[RET]] + return %0: vector<1x1x[8]xf16> +} + // CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims func.func @cast_away_insert_strided_slice_leading_one_dims(%arg0: vector<1x8xf16>, %arg1: vector<1x8x8xf16>) -> vector<1x8x8xf16> { // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<8xf16> from vector<1x8xf16> @@ -217,6 +227,17 @@ func.func @cast_away_insert_strided_slice_leading_one_dims(%arg0: vector<1x8xf16 return %0: vector<1x8x8xf16> } +// CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims_scalable +func.func @cast_away_insert_strided_slice_leading_one_dims_scalable(%arg0: vector<1x[8]xf16>, %arg1: vector<1x8x[8]xf16>) -> vector<1x8x[8]xf16> { + // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<[8]xf16> from vector<1x[8]xf16> + // CHECK: %[[DST:.+]] = vector.extract %{{.*}}[0] : vector<8x[8]xf16> from vector<1x8x[8]xf16> + // CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[SRC]], %[[DST]] {offsets = [0, 0], strides = [1]} : vector<[8]xf16> into vector<8x[8]xf16> + %0 = vector.insert_strided_slice %arg0, %arg1 {offsets = [0, 0, 0], strides = [1, 1]} : vector<1x[8]xf16> into vector<1x8x[8]xf16> + // CHECK: %[[RET:.+]] = vector.broadcast %[[INSERT]] : vector<8x[8]xf16> to vector<1x8x[8]xf16> + // CHECK: return %[[RET]] + return %0: vector<1x8x[8]xf16> +} + // CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims_one_element // CHECK-SAME: %[[ARG0:.+]]: vector<1x1xf16>, %{{.+}}: vector<1x1x1xf16> func.func 
@cast_away_insert_strided_slice_leading_one_dims_one_element(%arg0: vector<1x1xf16>, %arg1: vector<1x1x1xf16>) -> vector<1x1x1xf16> { -- cgit v1.1 From 2884d048396abc82c8356c4e350ef968fb24a0d7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 9 Feb 2024 17:16:21 +0000 Subject: [SROA] Add additional tests for splitting up ops with !tbaa.struct. --- llvm/test/Transforms/SROA/tbaa-struct3.ll | 107 ++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 llvm/test/Transforms/SROA/tbaa-struct3.ll diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll new file mode 100644 index 0000000..4910e0e --- /dev/null +++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p sroa -S %s | FileCheck %s + +define void @load_store_transfer_split_struct_tbaa_2_float(ptr dereferenceable(24) %res, float %a, float %b) { +; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_float( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[B]] to i32 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[RES]], align 4 +; CHECK-NEXT: [[RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES_SROA_IDX]], align 4 +; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[RES]], align 8 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { float, float }, align 4 + store float %a, ptr %tmp, align 4 + %tmp.4 = getelementptr inbounds i8, ptr %tmp, i64 4 + store float %b, ptr %tmp.4, align 4 + %l1 = load i64, ptr %tmp, !tbaa.struct !0 + store i64 %l1, ptr %res, !tbaa.struct !0 + %p = load ptr, ptr %res, align 8 + ret void +} + +define void @memcpy_transfer(ptr dereferenceable(24) %res, float %a, float %b) { +; CHECK-LABEL: define void @memcpy_transfer( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8 +; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa.struct [[TBAA_STRUCT0:![0-9]+]] +; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4 +; CHECK-NEXT: store float [[B]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { float, float }, align 4 + store float %a, ptr %tmp, align 4 + %__im_.i.i = getelementptr inbounds i8, ptr %tmp, i64 4 + store float %b, ptr %__im_.i.i, align 4 + %l.ptr = load ptr, ptr %res, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr %l.ptr, ptr %tmp, i64 8, i1 false), !tbaa.struct !0 + ret void +} + +define void @memcpy_transfer_tbaa_field_and_size_do_not_align(ptr dereferenceable(24) %res, float %a, float %b) { +; CHECK-LABEL: define void @memcpy_transfer_tbaa_field_and_size_do_not_align( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8 +; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa.struct [[TBAA_STRUCT0]] +; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B]] to i32 +; CHECK-NEXT: [[TMP_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc 
i32 [[TMP0]] to i16 +; CHECK-NEXT: store i16 [[TMP_SROA_2_0_EXTRACT_TRUNC]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT5]] +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { float, float }, align 4 + store float %a, ptr %tmp, align 4 + %__im_.i.i = getelementptr inbounds i8, ptr %tmp, i64 4 + store float %b, ptr %__im_.i.i, align 4 + %l.ptr = load ptr, ptr %res, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr %l.ptr, ptr %tmp, i64 6, i1 false), !tbaa.struct !0 + ret void +} + +define void @load_store_transfer_split_struct_tbaa_2_i31(ptr dereferenceable(24) %res, i31 %a, i31 %b) { +; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_i31( +; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], i31 [[A:%.*]], i31 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca { i31, i31 }, align 4 +; CHECK-NEXT: store i31 [[A]], ptr [[TMP]], align 4 +; CHECK-NEXT: [[TMP_4_TMP_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 4 +; CHECK-NEXT: store i31 [[B]], ptr [[TMP_4_TMP_4_SROA_IDX]], align 4 +; CHECK-NEXT: [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT0]] +; CHECK-NEXT: store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT0]] +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca { i31 , i31 }, align 4 + store i31 %a, ptr %tmp, align 4 + %tmp.4 = getelementptr inbounds i8, ptr %tmp, i64 4 + store i31 %b, ptr %tmp.4, align 4 + %l1 = load i62, ptr %tmp, !tbaa.struct !0 + store i62 %l1, ptr %res, !tbaa.struct !0 + ret void +} + + +; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2 + +!0 = !{i64 0, i64 4, !1, i64 4, i64 4, !1} +!1 = !{!2, !2, i64 0} +!2 = !{!"float", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C++ TBAA"} +;. +; CHECK: [[TBAA_STRUCT0]] = !{i64 0, i64 4, [[META1:![0-9]+]], i64 4, i64 4, [[META1]]} +; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"float", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C++ TBAA"} +; CHECK: [[TBAA_STRUCT5]] = !{i64 0, i64 4, [[META1]]} +;. -- cgit v1.1 From bb5c3899d1936ebdf7ebf5ca4347ee2e057bee7f Mon Sep 17 00:00:00 2001 From: Zain Jaffal Date: Fri, 9 Feb 2024 17:24:41 +0000 Subject: [InstCombine] Optimise x / sqrt(y / z) with fast-math pattern. (#76737) Replace the pattern with x * sqrt(z/y) --------- Co-authored-by: Matt Arsenault --- .../InstCombine/InstCombineMulDivRem.cpp | 30 ++++++++++++++++++++++ llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 18 ++++++------- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index f9cee9d..5918567 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1709,6 +1709,33 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I, return BinaryOperator::CreateFMulFMF(Op0, Pow, &I); } +/// Convert div to mul if we have an sqrt divisor iff sqrt's operand is a fdiv +/// instruction. 
+static Instruction *foldFDivSqrtDivisor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + // X / sqrt(Y / Z) --> X * sqrt(Z / Y) + if (!I.hasAllowReassoc() || !I.hasAllowReciprocal()) + return nullptr; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + auto *II = dyn_cast(Op1); + if (!II || II->getIntrinsicID() != Intrinsic::sqrt || !II->hasOneUse() || + !II->hasAllowReassoc() || !II->hasAllowReciprocal()) + return nullptr; + + Value *Y, *Z; + auto *DivOp = dyn_cast(II->getOperand(0)); + if (!DivOp || !DivOp->hasAllowReassoc() || !I.hasAllowReciprocal() || + !DivOp->hasOneUse()) + return nullptr; + if (match(DivOp, m_FDiv(m_Value(Y), m_Value(Z)))) { + Value *SwapDiv = Builder.CreateFDivFMF(Z, Y, DivOp); + Value *NewSqrt = + Builder.CreateUnaryIntrinsic(II->getIntrinsicID(), SwapDiv, II); + return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I); + } + return nullptr; +} + Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { Module *M = I.getModule(); @@ -1816,6 +1843,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { if (Instruction *Mul = foldFDivPowDivisor(I, Builder)) return Mul; + if (Instruction *Mul = foldFDivSqrtDivisor(I, Builder)) + return Mul; + // pow(X, Y) / X --> pow(X, Y-1) if (I.hasAllowReassoc() && match(Op0, m_OneUse(m_Intrinsic(m_Specific(Op1), diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll index 346271b..361837e 100644 --- a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll +++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll @@ -6,9 +6,9 @@ declare double @llvm.sqrt.f64(double) define double @sqrt_div_fast(double %x, double %y, double %z) { ; CHECK-LABEL: @sqrt_div_fast( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]]) -; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast double [[Z:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.sqrt.f64(double [[TMP0]]) +; CHECK-NEXT: [[DIV1:%.*]] = fmul fast double [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: @@ -36,9 +36,9 @@ entry: define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) { ; CHECK-LABEL: @sqrt_div_reassoc_arcp( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) -; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc arcp double [[Z:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]]) +; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: @@ -96,9 +96,9 @@ entry: define double @sqrt_div_arcp_missing(double %x, double %y, double %z) { ; CHECK-LABEL: @sqrt_div_arcp_missing( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]]) -; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double [[Z:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]]) +; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: -- cgit v1.1 
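A note on the algebra behind the transform above: for positive, finite y/z the
identity x / sqrt(y/z) = x * sqrt(z/y) holds because 1 / sqrt(a) = sqrt(1/a), so an
expensive division by a square root becomes a multiplication. In IEEE arithmetic
the two forms can round differently and disagree for zero, negative, infinite or
NaN inputs, which is why the fold requires both reassoc and arcp on every
instruction involved. A small self-contained check of the identity (a sketch, not
part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  const double x = 3.0, y = 2.0, z = 5.0;
  // The two forms agree to within rounding when y / z is positive and finite.
  std::printf("%.17g\n", x / std::sqrt(y / z));
  std::printf("%.17g\n", x * std::sqrt(z / y));
  return 0;
}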
From 301f6840522e3d924cf00ab6a04f93f1354142f5 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 9 Feb 2024 17:52:30 +0000 Subject: [Flang][OpenMP] NFC: Refactor reduction code (#79876) Introduces a new enumeration to list all Fortran reduction identifiers. Moves the combiner code-generation into a separate function for possible reuse in array context in future. --- flang/lib/Lower/OpenMP.cpp | 344 ++++++++++++++++++++++----------------------- 1 file changed, 172 insertions(+), 172 deletions(-) diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index ad4cffc..fd18b21 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -731,21 +731,59 @@ static void checkMapType(mlir::Location location, mlir::Type type) { class ReductionProcessor { public: - enum IntrinsicProc { MAX, MIN, IAND, IOR, IEOR }; - static IntrinsicProc + // TODO: Move this enumeration to the OpenMP dialect + enum ReductionIdentifier { + ID, + USER_DEF_OP, + ADD, + SUBTRACT, + MULTIPLY, + AND, + OR, + EQV, + NEQV, + MAX, + MIN, + IAND, + IOR, + IEOR + }; + static ReductionIdentifier getReductionType(const Fortran::parser::ProcedureDesignator &pd) { - auto redType = llvm::StringSwitch>( + auto redType = llvm::StringSwitch>( getRealName(pd).ToString()) - .Case("max", IntrinsicProc::MAX) - .Case("min", IntrinsicProc::MIN) - .Case("iand", IntrinsicProc::IAND) - .Case("ior", IntrinsicProc::IOR) - .Case("ieor", IntrinsicProc::IEOR) + .Case("max", ReductionIdentifier::MAX) + .Case("min", ReductionIdentifier::MIN) + .Case("iand", ReductionIdentifier::IAND) + .Case("ior", ReductionIdentifier::IOR) + .Case("ieor", ReductionIdentifier::IEOR) .Default(std::nullopt); assert(redType && "Invalid Reduction"); return *redType; } + static ReductionIdentifier getReductionType( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp) { + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + return ReductionIdentifier::ADD; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Subtract: + return ReductionIdentifier::SUBTRACT; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + return ReductionIdentifier::MULTIPLY; + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: + return ReductionIdentifier::AND; + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + return ReductionIdentifier::EQV; + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: + return ReductionIdentifier::OR; + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + return ReductionIdentifier::NEQV; + default: + llvm_unreachable("unexpected intrinsic operator in reduction"); + } + } + static bool supportedIntrinsicProcReduction( const Fortran::parser::ProcedureDesignator &pd) { const auto *name{Fortran::parser::Unwrap(pd)}; @@ -753,17 +791,14 @@ public: if (!name->symbol->GetUltimate().attrs().test( Fortran::semantics::Attr::INTRINSIC)) return false; - auto redType = llvm::StringSwitch>( - getRealName(name).ToString()) - .Case("max", IntrinsicProc::MAX) - .Case("min", IntrinsicProc::MIN) - .Case("iand", IntrinsicProc::IAND) - .Case("ior", IntrinsicProc::IOR) - .Case("ieor", IntrinsicProc::IEOR) - .Default(std::nullopt); - if (redType) - return true; - return false; + auto redType = llvm::StringSwitch(getRealName(name).ToString()) + .Case("max", true) + .Case("min", true) + .Case("iand", true) + .Case("ior", true) + .Case("ieor", true) + .Default(false); + return redType; } static const Fortran::semantics::SourceName @@ 
-817,32 +852,30 @@ public: /// reductionOpName. For example: /// 0 + x = x, /// 1 * x = x - static int getOperationIdentity( - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - mlir::Location loc) { - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + static int getOperationIdentity(ReductionIdentifier redId, + mlir::Location loc) { + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: return 0; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: return 1; default: TODO(loc, "Reduction of some intrinsic operators is not supported"); } } - static mlir::Value getIntrinsicProcInitValue( - mlir::Location loc, mlir::Type type, - const Fortran::parser::ProcedureDesignator &procDesignator, - fir::FirOpBuilder &builder) { + static mlir::Value getReductionInitValue(mlir::Location loc, mlir::Type type, + ReductionIdentifier redId, + fir::FirOpBuilder &builder) { assert((fir::isa_integer(type) || fir::isa_real(type) || type.isa()) && "only integer, logical and real types are currently supported"); - switch (getReductionType(procDesignator)) { - case IntrinsicProc::MAX: { + switch (redId) { + case ReductionIdentifier::MAX: { if (auto ty = type.dyn_cast()) { const llvm::fltSemantics &sem = ty.getFloatSemantics(); return builder.createRealConstant( @@ -852,7 +885,7 @@ public: int64_t minInt = llvm::APInt::getSignedMinValue(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, minInt); } - case IntrinsicProc::MIN: { + case ReductionIdentifier::MIN: { if (auto ty = type.dyn_cast()) { const llvm::fltSemantics &sem = ty.getFloatSemantics(); return builder.createRealConstant( @@ -862,46 +895,50 @@ public: int64_t maxInt = llvm::APInt::getSignedMaxValue(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, maxInt); } - case IntrinsicProc::IOR: { + case ReductionIdentifier::IOR: { unsigned bits = type.getIntOrFloatBitWidth(); int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, zeroInt); } - case IntrinsicProc::IEOR: { + case ReductionIdentifier::IEOR: { unsigned bits = type.getIntOrFloatBitWidth(); int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, zeroInt); } - case IntrinsicProc::IAND: { + case ReductionIdentifier::IAND: { unsigned bits = type.getIntOrFloatBitWidth(); int64_t allOnInt = llvm::APInt::getAllOnes(bits).getSExtValue(); return builder.createIntegerConstant(loc, type, allOnInt); } - } - llvm_unreachable("Unknown Reduction Intrinsic"); - } + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::OR: + case ReductionIdentifier::EQV: + case ReductionIdentifier::NEQV: + if (type.isa()) + return builder.create( + loc, type, + builder.getFloatAttr(type, + (double)getOperationIdentity(redId, loc))); + + if (type.isa()) { + mlir::Value intConst = builder.create( + loc, builder.getI1Type(), + builder.getIntegerAttr(builder.getI1Type(), + getOperationIdentity(redId, loc))); + return 
builder.createConvert(loc, type, intConst); + } - static mlir::Value getIntrinsicOpInitValue( - mlir::Location loc, mlir::Type type, - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - fir::FirOpBuilder &builder) { - if (type.isa()) return builder.create( loc, type, - builder.getFloatAttr(type, - (double)getOperationIdentity(intrinsicOp, loc))); - - if (type.isa()) { - mlir::Value intConst = builder.create( - loc, builder.getI1Type(), - builder.getIntegerAttr(builder.getI1Type(), - getOperationIdentity(intrinsicOp, loc))); - return builder.createConvert(loc, type, intConst); + builder.getIntegerAttr(type, getOperationIdentity(redId, loc))); + case ReductionIdentifier::ID: + case ReductionIdentifier::USER_DEF_OP: + case ReductionIdentifier::SUBTRACT: + TODO(loc, "Reduction of some identifier types is not supported"); } - - return builder.create( - loc, type, - builder.getIntegerAttr(type, getOperationIdentity(intrinsicOp, loc))); + llvm_unreachable("Unhandled Reduction identifier : getReductionInitValue"); } template @@ -915,118 +952,46 @@ public: return builder.create(loc, op1, op2); } - /// Creates an OpenMP reduction declaration and inserts it into the provided - /// symbol table. The declaration has a constant initializer with the neutral - /// value `initValue`, and the reduction combiner carried over from `reduce`. - /// TODO: Generalize this for non-integer types, add atomic region. - static mlir::omp::ReductionDeclareOp createReductionDecl( - fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, - const Fortran::parser::ProcedureDesignator &procDesignator, - mlir::Type type, mlir::Location loc) { - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::ModuleOp module = builder.getModule(); - - auto decl = - module.lookupSymbol(reductionOpName); - if (decl) - return decl; - - mlir::OpBuilder modBuilder(module.getBodyRegion()); - - decl = modBuilder.create( - loc, reductionOpName, type); - builder.createBlock(&decl.getInitializerRegion(), - decl.getInitializerRegion().end(), {type}, {loc}); - builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); - mlir::Value init = - getIntrinsicProcInitValue(loc, type, procDesignator, builder); - builder.create(loc, init); - - builder.createBlock(&decl.getReductionRegion(), - decl.getReductionRegion().end(), {type, type}, - {loc, loc}); - - builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); - mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); - mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - + static mlir::Value createScalarCombiner(fir::FirOpBuilder &builder, + mlir::Location loc, + ReductionIdentifier redId, + mlir::Type type, mlir::Value op1, + mlir::Value op2) { mlir::Value reductionOp; - switch (getReductionType(procDesignator)) { - case IntrinsicProc::MAX: + switch (redId) { + case ReductionIdentifier::MAX: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case IntrinsicProc::MIN: + case ReductionIdentifier::MIN: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case IntrinsicProc::IOR: + case ReductionIdentifier::IOR: assert((type.isIntOrIndex()) && "only integer is expected"); reductionOp = builder.create(loc, op1, op2); break; - case IntrinsicProc::IEOR: + case ReductionIdentifier::IEOR: assert((type.isIntOrIndex()) && "only integer is expected"); reductionOp = builder.create(loc, op1, op2); break; - case IntrinsicProc::IAND: + case ReductionIdentifier::IAND: assert((type.isIntOrIndex()) && 
"only integer is expected"); reductionOp = builder.create(loc, op1, op2); break; - } - - builder.create(loc, reductionOp); - return decl; - } - - /// Creates an OpenMP reduction declaration and inserts it into the provided - /// symbol table. The declaration has a constant initializer with the neutral - /// value `initValue`, and the reduction combiner carried over from `reduce`. - /// TODO: Generalize this for non-integer types, add atomic region. - static mlir::omp::ReductionDeclareOp createReductionDecl( - fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - mlir::Type type, mlir::Location loc) { - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::ModuleOp module = builder.getModule(); - - auto decl = - module.lookupSymbol(reductionOpName); - if (decl) - return decl; - - mlir::OpBuilder modBuilder(module.getBodyRegion()); - - decl = modBuilder.create( - loc, reductionOpName, type); - builder.createBlock(&decl.getInitializerRegion(), - decl.getInitializerRegion().end(), {type}, {loc}); - builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); - mlir::Value init = getIntrinsicOpInitValue(loc, type, intrinsicOp, builder); - builder.create(loc, init); - - builder.createBlock(&decl.getReductionRegion(), - decl.getReductionRegion().end(), {type, type}, - {loc, loc}); - - builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); - mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); - mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - - mlir::Value reductionOp; - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + case ReductionIdentifier::ADD: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + case ReductionIdentifier::MULTIPLY: reductionOp = getReductionOperation( builder, type, loc, op1, op2); break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: { + case ReductionIdentifier::AND: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1036,7 +1001,7 @@ public: reductionOp = builder.createConvert(loc, type, andiOp); break; } - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: { + case ReductionIdentifier::OR: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1045,7 +1010,7 @@ public: reductionOp = builder.createConvert(loc, type, oriOp); break; } - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: { + case ReductionIdentifier::EQV: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1055,7 +1020,7 @@ public: reductionOp = builder.createConvert(loc, type, cmpiOp); break; } - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: { + case ReductionIdentifier::NEQV: { mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); @@ -1069,7 +1034,46 @@ public: TODO(loc, "Reduction of some intrinsic operators is not supported"); } + return reductionOp; + } + + /// Creates an OpenMP reduction declaration and inserts it into the provided + /// symbol table. 
The declaration has a constant initializer with the neutral + /// value `initValue`, and the reduction combiner carried over from `reduce`. + /// TODO: Generalize this for non-integer types, add atomic region. + static mlir::omp::ReductionDeclareOp createReductionDecl( + fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc) { + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::ModuleOp module = builder.getModule(); + + auto decl = + module.lookupSymbol(reductionOpName); + if (decl) + return decl; + + mlir::OpBuilder modBuilder(module.getBodyRegion()); + + decl = modBuilder.create( + loc, reductionOpName, type); + builder.createBlock(&decl.getInitializerRegion(), + decl.getInitializerRegion().end(), {type}, {loc}); + builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); + mlir::Value init = getReductionInitValue(loc, type, redId, builder); + builder.create(loc, init); + + builder.createBlock(&decl.getReductionRegion(), + decl.getReductionRegion().end(), {type, type}, + {loc, loc}); + + builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); + mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); + mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); + + mlir::Value reductionOp = + createScalarCombiner(builder, loc, redId, type, op1, op2); builder.create(loc, reductionOp); + return decl; } @@ -1092,15 +1096,15 @@ public: const auto &intrinsicOp{ std::get( redDefinedOp->u)}; - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + ReductionIdentifier redId = getReductionType(intrinsicOp); + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: break; - default: TODO(currentLocation, "Reduction of some intrinsic operators is not supported"); @@ -1120,11 +1124,11 @@ public: decl = createReductionDecl( firOpBuilder, getReductionName(intrinsicOp, firOpBuilder.getI1Type()), - intrinsicOp, redType, currentLocation); + redId, redType, currentLocation); else if (redType.isIntOrIndexOrFloat()) { decl = createReductionDecl(firOpBuilder, getReductionName(intrinsicOp, redType), - intrinsicOp, redType, currentLocation); + redId, redType, currentLocation); } else { TODO(currentLocation, "Reduction of some types is not supported"); } @@ -1138,6 +1142,8 @@ public: &redOperator.u)) { if (ReductionProcessor::supportedIntrinsicProcReduction( *reductionIntrinsic)) { + ReductionProcessor::ReductionIdentifier redId = + ReductionProcessor::getReductionType(*reductionIntrinsic); for (const Fortran::parser::OmpObject &ompObject : objectList.v) { if (const auto *name{ Fortran::parser::Unwrap(ompObject)}) { @@ -1154,7 +1160,7 @@ public: firOpBuilder, getReductionName(getRealName(*reductionIntrinsic).ToString(), redType), - *reductionIntrinsic, redType, currentLocation); + redId, redType, currentLocation); reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( firOpBuilder.getContext(), decl.getSymName())); } @@ -4174,7 +4180,7 @@ void 
Fortran::lower::genOpenMPReduction( if (!ReductionProcessor::supportedIntrinsicProcReduction( *reductionIntrinsic)) continue; - ReductionProcessor::IntrinsicProc redIntrinsicProc = + ReductionProcessor::ReductionIdentifier redId = ReductionProcessor::getReductionType(*reductionIntrinsic); for (const Fortran::parser::OmpObject &ompObject : objectList.v) { if (const auto *name{ @@ -4195,10 +4201,8 @@ void Fortran::lower::genOpenMPReduction( if (reductionOp == nullptr) continue; - if (redIntrinsicProc == - ReductionProcessor::IntrinsicProc::MAX || - redIntrinsicProc == - ReductionProcessor::IntrinsicProc::MIN) { + if (redId == ReductionProcessor::ReductionIdentifier::MAX || + redId == ReductionProcessor::ReductionIdentifier::MIN) { assert(mlir::isa(reductionOp) && "Selection Op not found in reduction intrinsic"); mlir::Operation *compareOp = @@ -4206,13 +4210,9 @@ void Fortran::lower::genOpenMPReduction( updateReduction(compareOp, firOpBuilder, loadVal, reductionVal); } - if (redIntrinsicProc == - ReductionProcessor::IntrinsicProc::IOR || - redIntrinsicProc == - ReductionProcessor::IntrinsicProc::IEOR || - redIntrinsicProc == - ReductionProcessor::IntrinsicProc::IAND) { - + if (redId == ReductionProcessor::ReductionIdentifier::IOR || + redId == ReductionProcessor::ReductionIdentifier::IEOR || + redId == ReductionProcessor::ReductionIdentifier::IAND) { updateReduction(reductionOp, firOpBuilder, loadVal, reductionVal); } -- cgit v1.1 From b2b3a5248540320e74347fcdaffbd148d1e9d494 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Fri, 9 Feb 2024 18:05:51 +0000 Subject: Skip compiler directives between OMP PARALLEL DO and the loop (#81021) This fixes a compilation error when code like this is presented to the compiler: !$OMP PARALLEL DO !DIR$ VECTOR ALIGNED DO 20 i=1,N a = a + 0.5 20 CONTINUE The directive itself is later ignored (with a warning that this is happening), but because the compiler already errored out before that point, it completely fails to compile this code. Other compilers accept the code without complaints. --- flang/lib/Semantics/canonicalize-omp.cpp | 16 +++++++++++----- flang/test/Semantics/OpenMP/loop-association.f90 | 8 ++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index 013fb40..01adcf5 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -90,7 +90,11 @@ private: auto &dir{std::get(beginDir.t)}; nextIt = it; - if (++nextIt != block.end()) { + while (++nextIt != block.end()) { + // Ignore compiler directives. + if (auto *directive{GetConstructIf(*nextIt)}) + continue; + if (auto *doCons{GetConstructIf(*nextIt)}) { if (doCons->GetLoopControl()) { // move DoConstruct @@ -111,12 +115,14 @@ private: "DO loop after the %s directive must have loop control"_err_en_US, parser::ToUpperCaseLetters(dir.source.ToString())); } - return; // found do-loop + } else { + messages_.Say(dir.source, + "A DO loop must follow the %s directive"_err_en_US, + parser::ToUpperCaseLetters(dir.source.ToString())); } + // If we get here, we either found a loop, or issued an error message. 
+ return; } - messages_.Say(dir.source, - "A DO loop must follow the %s directive"_err_en_US, - parser::ToUpperCaseLetters(dir.source.ToString())); } void RewriteOmpAllocations(parser::ExecutionPart &body) { diff --git a/flang/test/Semantics/OpenMP/loop-association.f90 b/flang/test/Semantics/OpenMP/loop-association.f90 index 8a28fd8..d216766 100644 --- a/flang/test/Semantics/OpenMP/loop-association.f90 +++ b/flang/test/Semantics/OpenMP/loop-association.f90 @@ -30,6 +30,14 @@ c = c - 1 END DO outer + ! Accept directives between parallel do and actual loop. + !$OMP PARALLEL DO + !DIR$ VECTOR ALIGNED + DO 20 i=1,N + a = a + 0.5 +20 CONTINUE + !$OMP END PARALLEL DO + c = 16 !ERROR: DO loop after the PARALLEL DO directive must have loop control !$omp parallel do -- cgit v1.1 From d86f21693c5fb8eaa597cfcb15813ffc52d00847 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Fri, 9 Feb 2024 18:19:53 +0000 Subject: [clang-tidy][NFC] Fixes in release notes and documentation Minor fixes in documentation & release notes. --- clang-tools-extra/docs/ReleaseNotes.rst | 12 ++++++------ .../checks/readability/avoid-return-with-void-value.rst | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index dff8dd2..ee68c8f 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -118,8 +118,8 @@ Changes in existing checks global options of the same name. - Improved :doc:`bugprone-too-small-loop-variable - ` support by correctly - implementing the check for const loop boundary. + ` check by incorporating + better support for ``const`` loop boundaries. - Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer ` @@ -163,13 +163,13 @@ Changes in existing checks Removed checks ^^^^^^^^^^^^^^ -Miscellaneous -^^^^^^^^^^^^^ - - Removed `cert-dcl21-cpp`, which was deprecated since :program:`clang-tidy` 17, since the rule DCL21-CPP has been removed from the CERT guidelines. -- Fixed incorrect formatting in ``clang-apply-repalcements`` when no ``--format`` +Miscellaneous +^^^^^^^^^^^^^ + +- Fixed incorrect formatting in ``clang-apply-replacements`` when no ``--format`` option is specified. Now ``clang-apply-replacements`` applies formatting only with the option. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst index d802f9b..b079581 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst @@ -29,7 +29,7 @@ that should be written as g(); return; -to make clear that ``g()`` is called and immediately afterwards the function +to make clear that ``g()`` is called and immediately afterwards the function returns (nothing). In C, the same issue is detected by the compiler if the ``-Wpedantic`` mode @@ -46,6 +46,6 @@ Options .. option:: StrictMode The value `false` specifies that a direct return statement shall - be excluded from the analysis if it is the only statement not - contained in a block like ``if (cond) return g();``. The default + be excluded from the analysis if it is the only statement not + contained in a block, like ``if (cond) return g();``. The default value is `true`. 
-- cgit v1.1 From 407f9c06ea2a4f3fc32647ba22e5b60f695ca4b3 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Fri, 9 Feb 2024 10:33:58 -0800 Subject: [clang][driver] Set TLSDESC as the default for Android on RISC-V (#81198) --- clang/test/Driver/tls-dialect.c | 4 ++++ llvm/include/llvm/TargetParser/Triple.h | 5 +---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c index 4e105ce..f73915b 100644 --- a/clang/test/Driver/tls-dialect.c +++ b/clang/test/Driver/tls-dialect.c @@ -3,6 +3,10 @@ // RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s // RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s +/// Android supports TLSDESC by default on RISC-V +/// TLSDESC is not on by default in Linux, even on RISC-V, and is covered above +// RUN: %clang -### --target=riscv64-android %s 2>&1 | FileCheck --check-prefix=DESC %s + /// LTO // RUN: %clang -### --target=riscv64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s // RUN: %clang -### --target=riscv64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 98d8490..e732070 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -1035,10 +1035,7 @@ public: /// True if the target supports both general-dynamic and TLSDESC, and TLSDESC /// is enabled by default. - bool hasDefaultTLSDESC() const { - // TODO: Improve check for other platforms, like Android, and RISC-V - return false; - } + bool hasDefaultTLSDESC() const { return isAndroid() && isRISCV64(); } /// Tests whether the target uses -data-sections as default. bool hasDefaultDataSections() const { -- cgit v1.1 From 0329c1b6d838ec983f215244549b3c5ff2d5fb51 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 10:38:03 -0800 Subject: [ELF] --no-rosegment: don't mark read-only PT_LOAD segments executable (#81223) Once we move `.lrodata` after .bss (#78521), or if we use `SECTIONS` commands, certain read-only sections may be in their own PT_LOAD, not in the traditional "text segment". Current --no-rosegment code may unnecessarily mark read-only PT_LOAD executable. Fix it. --- lld/ELF/Writer.cpp | 28 ++++++++++++++++------------ lld/test/ELF/segments.s | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 6df43a3..53ca70b 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2353,17 +2353,12 @@ static bool needsPtLoad(OutputSection *sec) { return true; } -// Linker scripts are responsible for aligning addresses. Unfortunately, most -// linker scripts are designed for creating two PT_LOADs only, one RX and one -// RW. This means that there is no alignment in the RO to RX transition and we -// cannot create a PT_LOAD there. +// Adjust phdr flags according to certain options. static uint64_t computeFlags(uint64_t flags) { if (config->omagic) return PF_R | PF_W | PF_X; if (config->executeOnly && (flags & PF_X)) return flags & ~PF_R; - if (config->singleRoRx && !(flags & PF_W)) - return flags | PF_X; return flags; } @@ -2451,7 +2446,7 @@ SmallVector Writer::createPhdrs(Partition &part) { // Segments are contiguous memory regions that has the same attributes // (e.g. executable or writable). There is one phdr for each segment. 
// Therefore, we need to create a new phdr when the next section has - // different flags or is loaded at a discontiguous address or memory region + // incompatible flags or is loaded at a discontiguous address or memory region // using AT or AT> linker script command, respectively. // // As an exception, we don't create a separate load segment for the ELF @@ -2465,13 +2460,22 @@ SmallVector Writer::createPhdrs(Partition &part) { // so when hasSectionsCommand, since we cannot introduce the extra alignment // needed to create a new LOAD) uint64_t newFlags = computeFlags(sec->getPhdrFlags()); + // When --no-rosegment is specified, RO and RX sections are compatible. + uint32_t diff = flags ^ newFlags; + if (config->singleRoRx && !(newFlags & PF_W)) + diff &= ~PF_X; + if (diff) + load = nullptr; + bool sameLMARegion = load && !sec->lmaExpr && sec->lmaRegion == load->firstSec->lmaRegion; - if (!(load && newFlags == flags && sec != relroEnd && - sec->memRegion == load->firstSec->memRegion && - (sameLMARegion || load->lastSec == Out::programHeaders) && - (script->hasSectionsCommand || sec->type == SHT_NOBITS || - load->lastSec->type != SHT_NOBITS))) { + if (load && sec != relroEnd && + sec->memRegion == load->firstSec->memRegion && + (sameLMARegion || load->lastSec == Out::programHeaders) && + (script->hasSectionsCommand || sec->type == SHT_NOBITS || + load->lastSec->type != SHT_NOBITS)) { + load->p_flags |= newFlags; + } else { load = addHdr(PT_LOAD, newFlags); flags = newFlags; } diff --git a/lld/test/ELF/segments.s b/lld/test/ELF/segments.s index ee17117..1fe248a 100644 --- a/lld/test/ELF/segments.s +++ b/lld/test/ELF/segments.s @@ -44,7 +44,7 @@ # NOROSEGMENT1-NEXT: LOAD 0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW 0x1000 # NOROSEGMENT1-NEXT: LOAD 0x001007 0x0000000000000007 0x0000000000000007 0x000002 0x000002 R E 0x1000 # NOROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000 -# NOROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R E 0x1000 +# NOROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R 0x1000 # NOROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 # RUN: ld.lld -N a.o -o omagic -- cgit v1.1 From 314ef9617e87b2cba9dd278e228ab03453500054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 9 Feb 2024 10:41:37 -0800 Subject: [flang][cuda] Lower attribute for module variables (#81226) Propagate the CUDA attribute to the fir.global operation for simple module variables.
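As a sketch, the lowering path added here reduces to the following (condensed from the diff below; the converter, symbol, and location values are illustrative):

    // In defineModuleVariable(): translate the symbol's CUDA data attribute
    // and thread it through to the global definition.
    fir::CUDAAttributeAttr cudaAttr =
        Fortran::lower::translateSymbolCUDAAttribute(
            converter.getFirOpBuilder().getContext(), sym);
    defineGlobal(converter, var, globalName, linkage, cudaAttr);

    // In FirOpBuilder::createGlobal(): when present, attach the attribute
    // as a named attribute on the created fir.global operation.
    if (cudaAttr)
      attrs.push_back(mlir::NamedAttribute(
          fir::GlobalOp::getCudaAttrAttrName(globalOpName), cudaAttr));

so that, for example, a module variable declared `real, device :: mod_b_ra` is expected to yield a `fir.global @_QMcuda_varEmod_b_ra` carrying a device `cuda_attr`, as the updated test below checks.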
--- flang/include/flang/Optimizer/Builder/FIRBuilder.h | 6 ++- flang/include/flang/Optimizer/Dialect/FIROps.td | 3 +- flang/lib/Lower/ConvertVariable.cpp | 16 ++++--- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 23 ++++++---- flang/test/Lower/CUDA/cuda-data-attribute.cuf | 51 +++++++++++++--------- 5 files changed, 61 insertions(+), 38 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index 5384f6e..f50dacd 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -230,12 +230,14 @@ public: llvm::StringRef name, mlir::StringAttr linkage = {}, mlir::Attribute value = {}, bool isConst = false, - bool isTarget = false); + bool isTarget = false, + fir::CUDAAttributeAttr cudaAttr = {}); fir::GlobalOp createGlobal(mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, - mlir::StringAttr linkage = {}); + mlir::StringAttr linkage = {}, + fir::CUDAAttributeAttr cudaAttr = {}); /// Create a global constant (read-only) value. fir::GlobalOp createGlobalConstant(mlir::Location loc, mlir::Type type, diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index b954a0c..d505fed 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2737,7 +2737,8 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> { OptionalAttr:$initVal, OptionalAttr:$constant, OptionalAttr:$target, - OptionalAttr:$linkName + OptionalAttr:$linkName, + OptionalAttr:$cuda_attr ); let regions = (region AtMostRegion<1>:$region); diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index f14267f..2f23757 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -138,7 +138,8 @@ static bool isConstant(const Fortran::semantics::Symbol &sym) { static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, - mlir::StringAttr linkage); + mlir::StringAttr linkage, + fir::CUDAAttributeAttr cudaAttr = {}); static mlir::Location genLocation(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { @@ -462,7 +463,8 @@ void Fortran::lower::createGlobalInitialization( static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, - mlir::StringAttr linkage) { + mlir::StringAttr linkage, + fir::CUDAAttributeAttr cudaAttr) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); const Fortran::semantics::Symbol &sym = var.getSymbol(); mlir::Location loc = genLocation(converter, sym); @@ -500,8 +502,9 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, } } if (!global) - global = builder.createGlobal(loc, symTy, globalName, linkage, - mlir::Attribute{}, isConst, var.isTarget()); + global = + builder.createGlobal(loc, symTy, globalName, linkage, mlir::Attribute{}, + isConst, var.isTarget(), cudaAttr); if (Fortran::semantics::IsAllocatableOrPointer(sym) && !Fortran::semantics::IsProcedure(sym)) { const auto *details = @@ -2219,7 +2222,10 @@ void Fortran::lower::defineModuleVariable( // Do nothing. Mapping will be done on user side. 
} else { std::string globalName = converter.mangleName(sym); - defineGlobal(converter, var, globalName, linkage); + fir::CUDAAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDAAttribute( + converter.getFirOpBuilder().getContext(), sym); + defineGlobal(converter, var, globalName, linkage, cudaAttr); } } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 141f8fc..68fe8de 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -271,19 +271,24 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary( /// Create a global variable in the (read-only) data section. A global variable /// must have a unique name to identify and reference it. -fir::GlobalOp fir::FirOpBuilder::createGlobal(mlir::Location loc, - mlir::Type type, - llvm::StringRef name, - mlir::StringAttr linkage, - mlir::Attribute value, - bool isConst, bool isTarget) { +fir::GlobalOp fir::FirOpBuilder::createGlobal( + mlir::Location loc, mlir::Type type, llvm::StringRef name, + mlir::StringAttr linkage, mlir::Attribute value, bool isConst, + bool isTarget, fir::CUDAAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) return glob; setInsertionPoint(module.getBody(), module.getBody()->end()); - auto glob = - create(loc, name, isConst, isTarget, type, value, linkage); + llvm::SmallVector attrs; + if (cudaAttr) { + auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), + module.getContext()); + attrs.push_back(mlir::NamedAttribute( + fir::GlobalOp::getCudaAttrAttrName(globalOpName), cudaAttr)); + } + auto glob = create(loc, name, isConst, isTarget, type, value, + linkage, attrs); restoreInsertionPoint(insertPt); return glob; } @@ -291,7 +296,7 @@ fir::GlobalOp fir::FirOpBuilder::createGlobal(mlir::Location loc, fir::GlobalOp fir::FirOpBuilder::createGlobal( mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, - mlir::StringAttr linkage) { + mlir::StringAttr linkage, fir::CUDAAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf index b02701b..7596c6b 100644 --- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf +++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf @@ -3,6 +3,18 @@ ! Test lowering of CUDA attribute on variables. +module cuda_var + real, constant :: mod_a_rc +! CHECK: fir.global @_QMcuda_varEmod_a_rc {cuda_attr = #fir.cuda} : f32 + real, device :: mod_b_ra +! CHECK: fir.global @_QMcuda_varEmod_b_ra {cuda_attr = #fir.cuda} : f32 + real, allocatable, managed :: mod_c_rm +! CHECK: fir.global @_QMcuda_varEmod_c_rm {cuda_attr = #fir.cuda} : !fir.box> + real, allocatable, pinned :: mod_d_rp +! CHECK: fir.global @_QMcuda_varEmod_d_rp {cuda_attr = #fir.cuda} : !fir.box> + +contains + subroutine local_var_attrs real, constant :: rc real, device :: rd @@ -10,46 +22,43 @@ subroutine local_var_attrs real, allocatable, pinned :: rp end subroutine -! CHECK-LABEL: func.func @_QPlocal_var_attrs() -! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! 
CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK-LABEL: func.func @_QMcuda_varPlocal_var_attrs() +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErc"} : (!fir.ref) -> !fir.ref -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QFlocal_var_attrsErd"} : (!fir.ref) -> !fir.ref -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErm"} : (!fir.ref>>) -> !fir.ref>> -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFlocal_var_attrsErp"} : (!fir.ref>>) -> !fir.ref>> +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref) -> !fir.ref +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref) -> !fir.ref +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref>>) -> !fir.ref>> +! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref>>) -> !fir.ref>> subroutine dummy_arg_constant(dc) real, constant :: dc end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_constant( +! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_constant( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "dc", fir.cuda_attr = #fir.cuda} -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QFdummy_arg_constantEdc"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFdummy_arg_constantEdc"} : (!fir.ref) -> (!fir.ref, !fir.ref) subroutine dummy_arg_device(dd) real, device :: dd end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_device( +! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_device( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "dd", fir.cuda_attr = #fir.cuda}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QFdummy_arg_deviceEdd"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref) -> (!fir.ref, !fir.ref) subroutine dummy_arg_managed(dm) real, allocatable, managed :: dm end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_managed( +! 
CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_managed( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>> {fir.bindc_name = "dm", fir.cuda_attr = #fir.cuda}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFdummy_arg_managedEdm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) subroutine dummy_arg_pinned(dp) real, allocatable, pinned :: dp end subroutine -! CHECK-LABEL: func.func @_QPdummy_arg_pinned( +! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_pinned( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>> {fir.bindc_name = "dp", fir.cuda_attr = #fir.cuda}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFdummy_arg_pinnedEdp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) - - - - +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +end module -- cgit v1.1 From 2e4d2762b5f8c6b0ae02c2a9d517e009f470b8a6 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Fri, 9 Feb 2024 10:55:56 -0800 Subject: [X86][CodeGen] Emit float128 libcalls for math functions (#79611) Make LLVM emit libcalls to proper float128 variants for float128 types. --- llvm/lib/CodeGen/TargetLoweringBase.cpp | 40 +++++++++++++++++++++ llvm/test/CodeGen/X86/GlobalISel/roundeven.ll | 2 +- llvm/test/CodeGen/X86/fp128-libcalls-strict.ll | 48 +++++++++++++------------- llvm/test/CodeGen/X86/fp128-libcalls.ll | 24 ++++++------- llvm/test/CodeGen/X86/frem.ll | 2 +- 5 files changed, 78 insertions(+), 38 deletions(-) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 16cd14b..d8302ba 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -122,6 +122,46 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C); + // Use the f128 variants of math functions on x86_64 + if (TT.getArch() == Triple::ArchType::x86_64) { + setLibcallName(RTLIB::REM_F128, "fmodf128"); + setLibcallName(RTLIB::FMA_F128, "fmaf128"); + setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); + setLibcallName(RTLIB::CBRT_F128, "cbrtf128"); + setLibcallName(RTLIB::LOG_F128, "logf128"); + setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite"); + setLibcallName(RTLIB::LOG2_F128, "log2f128"); + setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite"); + setLibcallName(RTLIB::LOG10_F128, "log10f128"); + setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite"); + setLibcallName(RTLIB::EXP_F128, "expf128"); + setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite"); + setLibcallName(RTLIB::EXP2_F128, "exp2f128"); + setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite"); + setLibcallName(RTLIB::EXP10_F128, "exp10f128"); + setLibcallName(RTLIB::SIN_F128, "sinf128"); + setLibcallName(RTLIB::COS_F128, "cosf128"); + setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); + setLibcallName(RTLIB::POW_F128, "powf128"); + setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); + setLibcallName(RTLIB::CEIL_F128, "ceilf128"); + setLibcallName(RTLIB::TRUNC_F128, "truncf128"); + setLibcallName(RTLIB::RINT_F128, "rintf128"); + 
setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); + setLibcallName(RTLIB::ROUND_F128, "roundf128"); + setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128"); + setLibcallName(RTLIB::FLOOR_F128, "floorf128"); + setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128"); + setLibcallName(RTLIB::FMIN_F128, "fminf128"); + setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); + setLibcallName(RTLIB::LROUND_F128, "lroundf128"); + setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); + setLibcallName(RTLIB::LRINT_F128, "lrintf128"); + setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); + setLibcallName(RTLIB::LDEXP_F128, "ldexpf128"); + setLibcallName(RTLIB::FREXP_F128, "frexpf128"); + } + // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf". if (TT.isPPC()) { setLibcallName(RTLIB::ADD_F128, "__addkf3"); diff --git a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll index 119821e..dae27ff 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll @@ -44,7 +44,7 @@ define fp128 @roundeven_f128(fp128 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq roundevenl +; CHECK-NEXT: callq roundevenf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index 4722ce6..47234c3 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -163,7 +163,7 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; CHECK-LABEL: fma: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmal@PLT +; CHECK-NEXT: callq fmaf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -204,7 +204,7 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: frem: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -241,7 +241,7 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; CHECK-LABEL: ceil: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq ceill@PLT +; CHECK-NEXT: callq ceilf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -274,7 +274,7 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; CHECK-LABEL: cos: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq cosl@PLT +; CHECK-NEXT: callq cosf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -307,7 +307,7 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; CHECK-LABEL: exp: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq expl@PLT +; CHECK-NEXT: callq expf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -340,7 +340,7 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; CHECK-LABEL: exp2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq exp2l@PLT +; CHECK-NEXT: callq exp2f128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -373,7 +373,7 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; CHECK-LABEL: floor: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq floorl@PLT +; CHECK-NEXT: callq floorf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -406,7 +406,7 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; CHECK-LABEL: log: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq logl@PLT +; CHECK-NEXT: callq logf128 ; 
CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -439,7 +439,7 @@ define fp128 @log10(fp128 %x) nounwind strictfp { ; CHECK-LABEL: log10: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq log10l@PLT +; CHECK-NEXT: callq log10f128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -472,7 +472,7 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; CHECK-LABEL: log2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq log2l@PLT +; CHECK-NEXT: callq log2f128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -505,7 +505,7 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: maxnum: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmaxl@PLT +; CHECK-NEXT: callq fmaxf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -542,7 +542,7 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: minnum: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fminl@PLT +; CHECK-NEXT: callq fminf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -579,7 +579,7 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: nearbyint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq nearbyintl@PLT +; CHECK-NEXT: callq nearbyintf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -612,7 +612,7 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-LABEL: pow: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq powl@PLT +; CHECK-NEXT: callq powf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -683,7 +683,7 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: rint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq rintl@PLT +; CHECK-NEXT: callq rintf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -716,7 +716,7 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; CHECK-LABEL: round: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq roundl@PLT +; CHECK-NEXT: callq roundf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -749,7 +749,7 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; CHECK-LABEL: roundeven: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq roundevenl@PLT +; CHECK-NEXT: callq roundevenf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -782,7 +782,7 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; CHECK-LABEL: sin: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sinl@PLT +; CHECK-NEXT: callq sinf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -815,7 +815,7 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; CHECK-LABEL: sqrt: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sqrtl@PLT +; CHECK-NEXT: callq sqrtf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -848,7 +848,7 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; CHECK-LABEL: trunc: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq truncl@PLT +; CHECK-NEXT: callq truncf128 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq ; @@ -881,7 +881,7 @@ define i32 @lrint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: lrint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq lrintl@PLT +; CHECK-NEXT: callq lrintf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; @@ -904,7 +904,7 @@ define i64 @llrint(fp128 %x) nounwind strictfp { ; CHECK-LABEL: llrint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq llrintl@PLT +; CHECK-NEXT: callq 
llrintf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; @@ -927,7 +927,7 @@ define i32 @lround(fp128 %x) nounwind strictfp { ; CHECK-LABEL: lround: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq lroundl@PLT +; CHECK-NEXT: callq lroundf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; @@ -950,7 +950,7 @@ define i64 @llround(fp128 %x) nounwind strictfp { ; CHECK-LABEL: llround: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq llroundl@PLT +; CHECK-NEXT: callq llroundf128 ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll index 4e7e6b4..6946ca2 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll @@ -299,7 +299,7 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; CHECK-LABEL: Test128Rem: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -335,7 +335,7 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: movaps vf128(%rip), %xmm0 -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -370,7 +370,7 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Sqrt: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sqrtl@PLT +; CHECK-NEXT: callq sqrtf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -401,7 +401,7 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Sin: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sinl@PLT +; CHECK-NEXT: callq sinf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -432,7 +432,7 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Cos: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq cosl@PLT +; CHECK-NEXT: callq cosf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -463,7 +463,7 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Ceil: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq ceill@PLT +; CHECK-NEXT: callq ceilf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -494,7 +494,7 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Floor: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq floorl@PLT +; CHECK-NEXT: callq floorf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -525,7 +525,7 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Trunc: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq truncl@PLT +; CHECK-NEXT: callq truncf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -556,7 +556,7 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Nearbyint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq nearbyintl@PLT +; CHECK-NEXT: callq nearbyintf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -587,7 
+587,7 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Rint: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq rintl@PLT +; CHECK-NEXT: callq rintf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -618,7 +618,7 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; CHECK-LABEL: Test128Round: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq roundl@PLT +; CHECK-NEXT: callq roundf128 ; CHECK-NEXT: movaps %xmm0, vf128(%rip) ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -648,7 +648,7 @@ declare fp128 @llvm.round.f128(fp128) define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; CHECK-LABEL: Test128FMA: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: jmp fmal@PLT # TAILCALL +; CHECK-NEXT: jmp fmaf128@PLT # TAILCALL ; ; X86-LABEL: Test128FMA: ; X86: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll index d91d428..35d16c3 100644 --- a/llvm/test/CodeGen/X86/frem.ll +++ b/llvm/test/CodeGen/X86/frem.ll @@ -82,7 +82,7 @@ define void @frem_f128(fp128 %a0, fp128 %a1, ptr%p3) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: callq fmodf128 ; CHECK-NEXT: vmovaps %xmm0, (%rbx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq -- cgit v1.1 From 647010a06f3af725a2e674f025bc0e04aa1fbbff Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Feb 2024 10:56:33 -0800 Subject: [RISCV] Remove unnecessary check for RVE from determineCalleeSaves. NFCI The SavedRegs BitVector is checked against the CSR list later. We have a separate CSR list for RVE that excludes X16-31, so we don't need to filter here. If it were needed, it would also be needed for the next block of code, which didn't have an RVE check. --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b12b497..60f92af 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1003,9 +1003,7 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, }; for (auto Reg : CSRegs) - // Only save x0-x15 for RVE. - if (Reg < RISCV::X16 || !Subtarget.isRVE()) - SavedRegs.set(Reg) + SavedRegs.set(Reg); // According to psABI, if ilp32e/lp64e ABIs are used with an ISA that // has any of the registers x16-x31 and f0-f31, then these registers are -- cgit v1.1 From 5948d4de1d965d88c8ca05cc84bd94a28fa53ba4 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 9 Feb 2024 11:09:44 -0800 Subject: [RISCV] Add test coverage for buildvectors with long vslidedown sequences In advance of an upcoming change.
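Several of the tests added below build a vector lane by lane from scattered loads (the "gather" variants). For context, a hypothetical C++-level source with roughly that shape (not part of the patch; assumes Clang's vector_size extension, with offsets mirroring the test's GEPs):

    #include <stdint.h>
    typedef int8_t v16i8 __attribute__((vector_size(16)));

    v16i8 gather16(const int8_t *p) {
      v16i8 v = {0};  // zero-initialize so every lane is well defined
      v[0]  = p[0];   v[1]  = p[1];   v[2]  = p[22];  v[3]  = p[31];
      v[4]  = p[44];  v[5]  = p[55];  v[6]  = p[623]; v[7]  = p[75];
      v[8]  = p[82];  v[9]  = p[93];  v[10] = p[105]; v[11] = p[161];
      v[12] = p[124]; v[13] = p[163]; v[14] = p[144]; v[15] = p[154];
      return v;
    }

With the V extension, the resulting insertelement chain is currently lowered to a zero-strided splat of the first element (vlse8.v) followed by a long vslide1down.vx sequence, which is what the CHECK lines below pin down ahead of the upcoming change.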
--- .../CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll | 74 +++ .../RISCV/rvv/fixed-vectors-int-buildvec.ll | 509 +++++++++++++++++++++ 2 files changed, 583 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 57b2193..a2bd862 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1394,3 +1394,77 @@ define <2 x double> @vid_step2_v2f64() { ; CHECK-NEXT: ret ret <2 x double> } + + +define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float %e3, float %e4, float %e5, float %e6, float %e7) vscale_range(4, 128) { +; CHECK-LABEL: buildvec_v8f32_zvl256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: ret + %v0 = insertelement <8 x float> poison, float %e0, i64 0 + %v1 = insertelement <8 x float> %v0, float %e1, i64 1 + %v2 = insertelement <8 x float> %v1, float %e2, i64 2 + %v3 = insertelement <8 x float> %v2, float %e3, i64 3 + %v4 = insertelement <8 x float> %v3, float %e4, i64 4 + %v5 = insertelement <8 x float> %v4, float %e5, i64 5 + %v6 = insertelement <8 x float> %v5, float %e6, i64 6 + %v7 = insertelement <8 x float> %v6, float %e7, i64 7 + ret <8 x float> %v7 +} + + +define <8 x double> @buildvec_v8f64_zvl256(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(4, 128) { +; CHECK-LABEL: buildvec_v8f64_zvl256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: ret + %v0 = insertelement <8 x double> poison, double %e0, i64 0 + %v1 = insertelement <8 x double> %v0, double %e1, i64 1 + %v2 = insertelement <8 x double> %v1, double %e2, i64 2 + %v3 = insertelement <8 x double> %v2, double %e3, i64 3 + %v4 = insertelement <8 x double> %v3, double %e4, i64 4 + %v5 = insertelement <8 x double> %v4, double %e5, i64 5 + %v6 = insertelement <8 x double> %v5, double %e6, i64 6 + %v7 = insertelement <8 x double> %v6, double %e7, i64 7 + ret <8 x double> %v7 +} + +define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(8, 128) { +; CHECK-LABEL: buildvec_v8f64_zvl512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: ret + %v0 = insertelement <8 x double> poison, double %e0, i64 0 + %v1 = insertelement <8 x double> %v0, double %e1, i64 1 + %v2 = insertelement <8 x double> %v1, 
double %e2, i64 2 + %v3 = insertelement <8 x double> %v2, double %e3, i64 3 + %v4 = insertelement <8 x double> %v3, double %e4, i64 4 + %v5 = insertelement <8 x double> %v4, double %e5, i64 5 + %v6 = insertelement <8 x double> %v5, double %e6, i64 6 + %v7 = insertelement <8 x double> %v6, double %e7, i64 7 + ret <8 x double> %v7 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index dfafbfb..e691e63 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1178,3 +1178,512 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca %v4 = insertelement <8 x i64> %v3, i64 %d, i32 7 ret <8 x i64> %v4 } + + +define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { +; RV32-LABEL: buildvec_v16i8_loads_contigous: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: lbu a1, 1(a0) +; RV32-NEXT: lbu a2, 2(a0) +; RV32-NEXT: lbu a3, 3(a0) +; RV32-NEXT: lbu a4, 4(a0) +; RV32-NEXT: lbu a5, 5(a0) +; RV32-NEXT: lbu a6, 6(a0) +; RV32-NEXT: lbu a7, 7(a0) +; RV32-NEXT: lbu t0, 8(a0) +; RV32-NEXT: lbu t1, 9(a0) +; RV32-NEXT: lbu t2, 10(a0) +; RV32-NEXT: lbu t3, 11(a0) +; RV32-NEXT: lbu t4, 12(a0) +; RV32-NEXT: lbu t5, 13(a0) +; RV32-NEXT: lbu t6, 14(a0) +; RV32-NEXT: lbu s0, 15(a0) +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vlse8.v v8, (a0), zero +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v8, a4 +; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v8, v8, a7 +; RV32-NEXT: vslide1down.vx v8, v8, t0 +; RV32-NEXT: vslide1down.vx v8, v8, t1 +; RV32-NEXT: vslide1down.vx v8, v8, t2 +; RV32-NEXT: vslide1down.vx v8, v8, t3 +; RV32-NEXT: vslide1down.vx v8, v8, t4 +; RV32-NEXT: vslide1down.vx v8, v8, t5 +; RV32-NEXT: vslide1down.vx v8, v8, t6 +; RV32-NEXT: vslide1down.vx v8, v8, s0 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_v16i8_loads_contigous: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: lbu a1, 1(a0) +; RV64-NEXT: lbu a2, 2(a0) +; RV64-NEXT: lbu a3, 3(a0) +; RV64-NEXT: lbu a4, 4(a0) +; RV64-NEXT: lbu a5, 5(a0) +; RV64-NEXT: lbu a6, 6(a0) +; RV64-NEXT: lbu a7, 7(a0) +; RV64-NEXT: lbu t0, 8(a0) +; RV64-NEXT: lbu t1, 9(a0) +; RV64-NEXT: lbu t2, 10(a0) +; RV64-NEXT: lbu t3, 11(a0) +; RV64-NEXT: lbu t4, 12(a0) +; RV64-NEXT: lbu t5, 13(a0) +; RV64-NEXT: lbu t6, 14(a0) +; RV64-NEXT: lbu s0, 15(a0) +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vlse8.v v8, (a0), zero +; RV64-NEXT: vslide1down.vx v8, v8, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a2 +; RV64-NEXT: vslide1down.vx v8, v8, a3 +; RV64-NEXT: vslide1down.vx v8, v8, a4 +; RV64-NEXT: vslide1down.vx v8, v8, a5 +; RV64-NEXT: vslide1down.vx v8, v8, a6 +; RV64-NEXT: vslide1down.vx v8, v8, a7 +; RV64-NEXT: vslide1down.vx v8, v8, t0 +; RV64-NEXT: vslide1down.vx v8, v8, t1 +; RV64-NEXT: vslide1down.vx v8, v8, t2 +; RV64-NEXT: vslide1down.vx v8, v8, t3 +; RV64-NEXT: vslide1down.vx v8, v8, t4 +; RV64-NEXT: vslide1down.vx v8, v8, t5 +; RV64-NEXT: 
vslide1down.vx v8, v8, t6 +; RV64-NEXT: vslide1down.vx v8, v8, s0 +; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 2 + %p4 = getelementptr i8, ptr %p, i32 3 + %p5 = getelementptr i8, ptr %p, i32 4 + %p6 = getelementptr i8, ptr %p, i32 5 + %p7 = getelementptr i8, ptr %p, i32 6 + %p8 = getelementptr i8, ptr %p, i32 7 + %p9 = getelementptr i8, ptr %p, i32 8 + %p10 = getelementptr i8, ptr %p, i32 9 + %p11 = getelementptr i8, ptr %p, i32 10 + %p12 = getelementptr i8, ptr %p, i32 11 + %p13 = getelementptr i8, ptr %p, i32 12 + %p14 = getelementptr i8, ptr %p, i32 13 + %p15 = getelementptr i8, ptr %p, i32 14 + %p16 = getelementptr i8, ptr %p, i32 15 + + %ld1 = load i8, ptr %p + %ld2 = load i8, ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} + + +define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { +; RV32-LABEL: buildvec_v16i8_loads_gather: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: lbu a1, 1(a0) +; RV32-NEXT: lbu a2, 22(a0) +; RV32-NEXT: lbu a3, 31(a0) +; RV32-NEXT: lbu a4, 44(a0) +; RV32-NEXT: lbu a5, 55(a0) +; RV32-NEXT: lbu a6, 623(a0) +; RV32-NEXT: lbu a7, 75(a0) +; RV32-NEXT: lbu t0, 82(a0) +; RV32-NEXT: lbu t1, 93(a0) +; RV32-NEXT: lbu t2, 105(a0) +; RV32-NEXT: lbu t3, 161(a0) +; RV32-NEXT: lbu t4, 124(a0) +; RV32-NEXT: lbu t5, 163(a0) +; RV32-NEXT: lbu t6, 144(a0) +; RV32-NEXT: lbu s0, 154(a0) +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vlse8.v v8, (a0), zero +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v8, a4 +; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v8, v8, a7 +; RV32-NEXT: vslide1down.vx v8, v8, t0 +; RV32-NEXT: vslide1down.vx v8, v8, t1 +; RV32-NEXT: vslide1down.vx v8, v8, t2 +; RV32-NEXT: vslide1down.vx v8, v8, t3 +; RV32-NEXT: vslide1down.vx v8, v8, t4 +; RV32-NEXT: vslide1down.vx v8, v8, t5 +; RV32-NEXT: vslide1down.vx v8, v8, t6 +; RV32-NEXT: vslide1down.vx v8, v8, s0 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; 
RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_v16i8_loads_gather: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: lbu a1, 1(a0) +; RV64-NEXT: lbu a2, 22(a0) +; RV64-NEXT: lbu a3, 31(a0) +; RV64-NEXT: lbu a4, 44(a0) +; RV64-NEXT: lbu a5, 55(a0) +; RV64-NEXT: lbu a6, 623(a0) +; RV64-NEXT: lbu a7, 75(a0) +; RV64-NEXT: lbu t0, 82(a0) +; RV64-NEXT: lbu t1, 93(a0) +; RV64-NEXT: lbu t2, 105(a0) +; RV64-NEXT: lbu t3, 161(a0) +; RV64-NEXT: lbu t4, 124(a0) +; RV64-NEXT: lbu t5, 163(a0) +; RV64-NEXT: lbu t6, 144(a0) +; RV64-NEXT: lbu s0, 154(a0) +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vlse8.v v8, (a0), zero +; RV64-NEXT: vslide1down.vx v8, v8, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a2 +; RV64-NEXT: vslide1down.vx v8, v8, a3 +; RV64-NEXT: vslide1down.vx v8, v8, a4 +; RV64-NEXT: vslide1down.vx v8, v8, a5 +; RV64-NEXT: vslide1down.vx v8, v8, a6 +; RV64-NEXT: vslide1down.vx v8, v8, a7 +; RV64-NEXT: vslide1down.vx v8, v8, t0 +; RV64-NEXT: vslide1down.vx v8, v8, t1 +; RV64-NEXT: vslide1down.vx v8, v8, t2 +; RV64-NEXT: vslide1down.vx v8, v8, t3 +; RV64-NEXT: vslide1down.vx v8, v8, t4 +; RV64-NEXT: vslide1down.vx v8, v8, t5 +; RV64-NEXT: vslide1down.vx v8, v8, t6 +; RV64-NEXT: vslide1down.vx v8, v8, s0 +; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 22 + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + %p13 = getelementptr i8, ptr %p, i32 124 + %p14 = getelementptr i8, ptr %p, i32 163 + %p15 = getelementptr i8, ptr %p, i32 144 + %p16 = getelementptr i8, ptr %p, i32 154 + + %ld1 = load i8, ptr %p + %ld2 = load i8, ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} + +define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_undef_low_half: +; CHECK: # 
%bb.0: +; CHECK-NEXT: addi a1, a0, 82 +; CHECK-NEXT: lbu a2, 93(a0) +; CHECK-NEXT: lbu a3, 105(a0) +; CHECK-NEXT: lbu a4, 161(a0) +; CHECK-NEXT: lbu a5, 124(a0) +; CHECK-NEXT: lbu a6, 163(a0) +; CHECK-NEXT: lbu a7, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: ret + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + %p13 = getelementptr i8, ptr %p, i32 124 + %p14 = getelementptr i8, ptr %p, i32 163 + %p15 = getelementptr i8, ptr %p, i32 144 + %p16 = getelementptr i8, ptr %p, i32 154 + + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v9 = insertelement <16 x i8> poison, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} + +define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_undef_high_half: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a1, 1(a0) +; CHECK-NEXT: lbu a2, 22(a0) +; CHECK-NEXT: lbu a3, 31(a0) +; CHECK-NEXT: lbu a4, 44(a0) +; CHECK-NEXT: lbu a5, 55(a0) +; CHECK-NEXT: lbu a6, 623(a0) +; CHECK-NEXT: lbu a7, 75(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 22 + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + + %ld1 = load i8, ptr %p + %ld2 = load i8, ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + ret <16 x i8> %v8 +} + +define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_undef_edges: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, a0, 31 +; 
CHECK-NEXT: lbu a2, 44(a0) +; CHECK-NEXT: lbu a3, 55(a0) +; CHECK-NEXT: lbu a4, 623(a0) +; CHECK-NEXT: lbu a5, 75(a0) +; CHECK-NEXT: lbu a6, 82(a0) +; CHECK-NEXT: lbu a7, 93(a0) +; CHECK-NEXT: lbu t0, 105(a0) +; CHECK-NEXT: lbu a0, 161(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: ret + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + + %v4 = insertelement <16 x i8> poison, i8 %ld4, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11 + ret <16 x i8> %v12 +} + +define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { +; CHECK-LABEL: buildvec_v16i8_loads_undef_scattered: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a1, 1(a0) +; CHECK-NEXT: lbu a2, 44(a0) +; CHECK-NEXT: lbu a3, 55(a0) +; CHECK-NEXT: lbu a4, 75(a0) +; CHECK-NEXT: lbu a5, 82(a0) +; CHECK-NEXT: lbu a6, 93(a0) +; CHECK-NEXT: lbu a7, 124(a0) +; CHECK-NEXT: lbu t0, 144(a0) +; CHECK-NEXT: lbu t1, 154(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: ret + %p2 = getelementptr i8, ptr %p, i32 1 + %p3 = getelementptr i8, ptr %p, i32 22 + %p4 = getelementptr i8, ptr %p, i32 31 + %p5 = getelementptr i8, ptr %p, i32 44 + %p6 = getelementptr i8, ptr %p, i32 55 + %p7 = getelementptr i8, ptr %p, i32 623 + %p8 = getelementptr i8, ptr %p, i32 75 + %p9 = getelementptr i8, ptr %p, i32 82 + %p10 = getelementptr i8, ptr %p, i32 93 + %p11 = getelementptr i8, ptr %p, i32 105 + %p12 = getelementptr i8, ptr %p, i32 161 + %p13 = getelementptr i8, ptr %p, i32 124 + %p14 = getelementptr i8, ptr %p, i32 163 + %p15 = getelementptr i8, ptr %p, i32 144 + %p16 = getelementptr i8, ptr %p, i32 154 + + %ld1 = load i8, ptr %p + %ld2 = load i8, 
ptr %p2 + %ld3 = load i8, ptr %p3 + %ld4 = load i8, ptr %p4 + %ld5 = load i8, ptr %p5 + %ld6 = load i8, ptr %p6 + %ld7 = load i8, ptr %p7 + %ld8 = load i8, ptr %p8 + %ld9 = load i8, ptr %p9 + %ld10 = load i8, ptr %p10 + %ld11 = load i8, ptr %p11 + %ld12 = load i8, ptr %p12 + %ld13 = load i8, ptr %p13 + %ld14 = load i8, ptr %p14 + %ld15 = load i8, ptr %p15 + %ld16 = load i8, ptr %p16 + + %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0 + %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1 + %v3 = insertelement <16 x i8> %v2, i8 undef, i32 2 + %v4 = insertelement <16 x i8> %v3, i8 undef, i32 3 + %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4 + %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5 + %v7 = insertelement <16 x i8> %v6, i8 undef, i32 6 + %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7 + %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8 + %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9 + %v11 = insertelement <16 x i8> %v10, i8 undef, i32 10 + %v12 = insertelement <16 x i8> %v11, i8 undef, i32 11 + %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12 + %v14 = insertelement <16 x i8> %v13, i8 undef, i32 13 + %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14 + %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15 + ret <16 x i8> %v16 +} -- cgit v1.1 From 2a6b521b36fb538a49564323ecd457d7b08b1325 Mon Sep 17 00:00:00 2001 From: Yinying Li <107574043+yinying-lisa-li@users.noreply.github.com> Date: Fri, 9 Feb 2024 14:34:36 -0500 Subject: [mlir][sparse] Add more tests and verification for n:m (#81186) 1. Add python test for n out of m 2. Add more methods for python binding 3. Add verification for n:m and invalid encoding tests 4. Add e2e test for n:m Previous PRs for n:m #80501 #79935 --- mlir/include/mlir-c/Dialect/SparseTensor.h | 10 ++ mlir/lib/Bindings/Python/DialectSparseTensor.cpp | 38 +++++++- mlir/lib/CAPI/Dialect/SparseTensor.cpp | 18 ++++ .../SparseTensor/IR/Detail/LvlTypeParser.cpp | 34 ++++--- .../Dialect/SparseTensor/IR/Detail/LvlTypeParser.h | 4 +- .../SparseTensor/IR/SparseTensorDialect.cpp | 31 ++++++ .../Dialect/SparseTensor/invalid_encoding.mlir | 106 +++++++++++++++++++++ .../Dialect/SparseTensor/CPU/sparse_ds.mlir | 22 +++++ mlir/test/python/dialects/sparse_tensor/dialect.py | 84 ++++++++++++++++ 9 files changed, 331 insertions(+), 16 deletions(-) diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h index 2c71b00..d549f5d 100644 --- a/mlir/include/mlir-c/Dialect/SparseTensor.h +++ b/mlir/include/mlir-c/Dialect/SparseTensor.h @@ -84,6 +84,16 @@ mlirSparseTensorEncodingAttrGetPosWidth(MlirAttribute attr); MLIR_CAPI_EXPORTED int mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr); +MLIR_CAPI_EXPORTED unsigned +mlirSparseTensorEncodingAttrGetStructuredN(MlirSparseTensorLevelType lvlType); + +MLIR_CAPI_EXPORTED unsigned +mlirSparseTensorEncodingAttrGetStructuredM(MlirSparseTensorLevelType lvlType); + +MLIR_CAPI_EXPORTED MlirSparseTensorLevelType +mlirSparseTensorEncodingAttrBuildLvlType( + enum MlirBaseSparseTensorLevelType lvlType, unsigned n, unsigned m); + #ifdef __cplusplus } #endif diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp index 607534c..74f4d24 100644 --- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp +++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp @@ -60,6 +60,15 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) { py::arg("lvl_to_dim"), py::arg("pos_width"), py::arg("crd_width"), 
py::arg("context") = py::none(), "Gets a sparse_tensor.encoding from parameters.") + .def_classmethod( + "build_level_type", + [](py::object cls, MlirBaseSparseTensorLevelType lvlType, unsigned n, + unsigned m) { + return mlirSparseTensorEncodingAttrBuildLvlType(lvlType, n, m); + }, + py::arg("cls"), py::arg("lvl_type"), py::arg("n") = 0, + py::arg("m") = 0, + "Builds a sparse_tensor.encoding.level_type from parameters.") .def_property_readonly( "lvl_types", [](MlirAttribute self) { @@ -89,7 +98,34 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) { .def_property_readonly("pos_width", mlirSparseTensorEncodingAttrGetPosWidth) .def_property_readonly("crd_width", - mlirSparseTensorEncodingAttrGetCrdWidth); + mlirSparseTensorEncodingAttrGetCrdWidth) + .def_property_readonly( + "structured_n", + [](MlirAttribute self) -> unsigned { + const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self); + return mlirSparseTensorEncodingAttrGetStructuredN( + mlirSparseTensorEncodingAttrGetLvlType(self, lvlRank - 1)); + }) + .def_property_readonly( + "structured_m", + [](MlirAttribute self) -> unsigned { + const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self); + return mlirSparseTensorEncodingAttrGetStructuredM( + mlirSparseTensorEncodingAttrGetLvlType(self, lvlRank - 1)); + }) + .def_property_readonly("lvl_types_enum", [](MlirAttribute self) { + const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self); + std::vector ret; + ret.reserve(lvlRank); + for (int l = 0; l < lvlRank; l++) { + // Convert level type to 32 bits to ignore n and m for n_out_of_m + // format. + ret.push_back( + static_cast(static_cast( + mlirSparseTensorEncodingAttrGetLvlType(self, l)))); + } + return ret; + }); } PYBIND11_MODULE(_mlirDialectsSparseTensor, m) { diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp index a34b9a29..4e1bd45 100644 --- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp +++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp @@ -94,3 +94,21 @@ int mlirSparseTensorEncodingAttrGetPosWidth(MlirAttribute attr) { int mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr) { return cast(unwrap(attr)).getCrdWidth(); } + +MlirSparseTensorLevelType +mlirSparseTensorEncodingAttrBuildLvlType(MlirBaseSparseTensorLevelType lvlType, + unsigned n, unsigned m) { + LevelType lt = static_cast(lvlType); + return static_cast(*buildLevelType( + *getLevelFormat(lt), isOrderedLT(lt), isUniqueLT(lt), n, m)); +} + +unsigned +mlirSparseTensorEncodingAttrGetStructuredN(MlirSparseTensorLevelType lvlType) { + return getN(static_cast(lvlType)); +} + +unsigned +mlirSparseTensorEncodingAttrGetStructuredM(MlirSparseTensorLevelType lvlType) { + return getM(static_cast(lvlType)); +} diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp index 752d6e6..0fb0d27 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp @@ -35,14 +35,22 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { ERROR_IF(failed(parser.parseOptionalKeyword(&base)), "expected valid level format (e.g. 
dense, compressed or singleton)") uint64_t properties = 0; - SmallVector structure; + SmallVector structured; if (base.compare("structured") == 0) { ParseResult res = parser.parseCommaSeparatedList( mlir::OpAsmParser::Delimiter::OptionalSquare, - [&]() -> ParseResult { return parseStructure(parser, &structure); }, - " in block n out of m"); + [&]() -> ParseResult { return parseStructured(parser, &structured); }, + " in structured n out of m"); FAILURE_IF_FAILED(res) + if (structured.size() != 2) { + parser.emitError(loc, "expected exactly 2 structured sizes"); + return failure(); + } + if (structured[0] > structured[1]) { + parser.emitError(loc, "expected n <= m in n_out_of_m"); + return failure(); + } } ParseResult res = parser.parseCommaSeparatedList( @@ -57,12 +65,8 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { } else if (base.compare("compressed") == 0) { properties |= static_cast(LevelFormat::Compressed); } else if (base.compare("structured") == 0) { - if (structure.size() != 2) { - parser.emitError(loc, "expected exactly 2 structure sizes"); - return failure(); - } properties |= static_cast(LevelFormat::NOutOfM); - properties |= nToBits(structure[0]) | mToBits(structure[1]); + properties |= nToBits(structured[0]) | mToBits(structured[1]); } else if (base.compare("loose_compressed") == 0) { properties |= static_cast(LevelFormat::LooseCompressed); } else if (base.compare("singleton") == 0) { @@ -95,20 +99,24 @@ ParseResult LvlTypeParser::parseProperty(AsmParser &parser, } ParseResult -LvlTypeParser::parseStructure(AsmParser &parser, - SmallVector *structure) const { +LvlTypeParser::parseStructured(AsmParser &parser, + SmallVector *structured) const { int intVal; auto loc = parser.getCurrentLocation(); OptionalParseResult intValParseResult = parser.parseOptionalInteger(intVal); if (intValParseResult.has_value()) { if (failed(*intValParseResult)) { - parser.emitError(loc, "failed to parse block size"); + parser.emitError(loc, "failed to parse structured size"); + return failure(); + } + if (intVal < 0) { + parser.emitError(loc, "expected structured size to be >= 0"); return failure(); } - structure->push_back(intVal); + structured->push_back(intVal); return success(); } - parser.emitError(loc, "expected valid integer for block size"); + parser.emitError(loc, "expected valid integer for structured size"); return failure(); } diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h index 6a13112..1ac8254 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h @@ -22,8 +22,8 @@ public: private: ParseResult parseProperty(AsmParser &parser, uint64_t *properties) const; - ParseResult parseStructure(AsmParser &parser, - SmallVector *structure) const; + ParseResult parseStructured(AsmParser &parser, + SmallVector *structured) const; }; } // namespace ir_detail diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 67b1d79..aed43f2 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -657,6 +657,37 @@ LogicalResult SparseTensorEncodingAttr::verify( return emitError() << "expected all singleton lvlTypes " "following a singleton level"; } + // TODO: audit formats that actually are supported by backend. 
+ if (auto it = std::find_if(lvlTypes.begin(), lvlTypes.end(), isNOutOfMLT); + it != std::end(lvlTypes)) { + if (it != lvlTypes.end() - 1) + return emitError() << "expected n_out_of_m to be the last level type"; + if (!std::all_of(lvlTypes.begin(), it, + [](LevelType i) { return isDenseLT(i); })) + return emitError() << "expected all dense lvlTypes " + "before a n_out_of_m level"; + if (dimToLvl && (dimToLvl.getNumDims() != dimToLvl.getNumResults())) { + if (!isBlockSparsity(dimToLvl)) { + return emitError() + << "expected 1xm block structure for n_out_of_m level"; + } + auto sizes = getBlockSize(dimToLvl); + unsigned coefficient = 0; + for (const auto &elem : sizes) { + if (elem != 0) { + if (elem != coefficient && coefficient != 0) { + return emitError() << "expected only one blocked level " + "with the same coefficients"; + } + coefficient = elem; + } + } + if (coefficient != getM(*it)) { + return emitError() << "expected coefficients of Affine expressions " + "to be equal to m of n_out_of_m level"; + } + } + } // Before we can check that the level-rank is consistent/coherent // across all fields, we need to define it. The source-of-truth for // the `getLvlRank` method is the length of the level-types array, diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir index 2d189cc..a52a46b4 100644 --- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir @@ -315,3 +315,109 @@ func.func private @BSR(%arg0: tensor) { func.func private @BSR_explicit(%arg0: tensor) { return } + +// ----- + +// expected-error@+6 {{expected structured size to be >= 0}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : dense, + j : dense, + k mod 4 : structured[-2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+6 {{expected n <= m in n_out_of_m}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : dense, + j : dense, + k mod 4 : structured[5, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected all dense lvlTypes before a n_out_of_m level}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : compressed, + j : dense, + k mod 4 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected n_out_of_m to be the last level type}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 4 : structured[2, 4], + j : dense, + k mod 4 : compressed + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected 1xm block structure for n_out_of_m level}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 2 : dense, + j : dense, + k mod 4 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected coefficients of Affine expressions to be equal to m of n_out_of_m level}} +#NOutOfM = #sparse_tensor.encoding<{ + map = ( i, j, k ) -> + ( i : dense, + k floordiv 2 : dense, + j : dense, + k mod 2 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} + +// ----- + +// expected-error@+1 {{expected only one blocked level with the same coefficients}} +#NOutOfM = #sparse_tensor.encoding<{ + map = 
( i, j, k ) -> + ( i floordiv 2 : dense, + i mod 2 : dense, + j : dense, + k floordiv 4 : dense, + k mod 4 : structured[2, 4] + ) +}> +func.func private @NOutOfM(%arg0: tensor) { + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir index ec5c758..251944c 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir @@ -45,6 +45,13 @@ crdWidth = 8 }> +#NV_58 = #sparse_tensor.encoding<{ + map = ( i, j ) -> ( i : dense, + j floordiv 8 : dense, + j mod 8 : structured[5, 8]), + crdWidth = 8 +}> + module { func.func private @getTensorFilename(index) -> (!Filename) @@ -65,6 +72,7 @@ module { %A1 = sparse_tensor.new %fileName : !Filename to tensor %A2 = sparse_tensor.new %fileName : !Filename to tensor %A3 = sparse_tensor.new %fileName : !Filename to tensor + %A4 = sparse_tensor.new %fileName : !Filename to tensor // // CSR: @@ -113,10 +121,24 @@ module { %vecv3 = vector.transfer_read %val3[%c0], %f0 : memref, vector<12xf64> vector.print %vecv3 : vector<12xf64> + // + // NV_58 + // + // CHECK-NEXT: ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 ) + // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ) + // + %crd4 = sparse_tensor.coordinates %A4 {level = 2 : index } : tensor to memref + %vecc4 = vector.transfer_read %crd4[%c0], %u0 : memref, vector<12xi8> + vector.print %vecc4 : vector<12xi8> + %val4 = sparse_tensor.values %A4 : tensor to memref + %vecv4 = vector.transfer_read %val4[%c0], %f0 : memref, vector<12xf64> + vector.print %vecv4 : vector<12xf64> + // Release the resources. bufferization.dealloc_tensor %A1: tensor bufferization.dealloc_tensor %A2: tensor bufferization.dealloc_tensor %A3: tensor + bufferization.dealloc_tensor %A4: tensor return } diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py index 412c579..1fa7030 100644 --- a/mlir/test/python/dialects/sparse_tensor/dialect.py +++ b/mlir/test/python/dialects/sparse_tensor/dialect.py @@ -52,6 +52,90 @@ def testEncodingAttr1D(): print(f"created_pos_width: {created.pos_width}") +# CHECK-LABEL: TEST: testEncodingAttrStructure +@run +def testEncodingAttrStructure(): + with Context() as ctx: + parsed = Attribute.parse( + "#sparse_tensor.encoding<{" + " map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense," + " d1 mod 4 : structured[2, 4])," + " posWidth = 16," + " crdWidth = 32" + "}>" + ) + # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]), posWidth = 16, crdWidth = 32 }> + print(parsed) + + casted = st.EncodingAttr(parsed) + # CHECK: equal: True + print(f"equal: {casted == parsed}") + + # CHECK: lvl_types: [65536, 65536, 4406637494272] + print(f"lvl_types: {casted.lvl_types}") + # CHECK: lvl_types_enum: [, , ] + print(f"lvl_types_enum: {casted.lvl_types_enum}") + # CHECK: structured_n: 2 + print(f"structured_n: {casted.structured_n}") + # CHECK: structured_m: 4 + print(f"structured_m: {casted.structured_m}") + # CHECK: dim_to_lvl: (d0, d1) -> (d0, d1 floordiv 4, d1 mod 4) + print(f"dim_to_lvl: {casted.dim_to_lvl}") + # CHECK: lvl_to_dim: (d0, d1, d2) -> (d0, d1 * 4 + d2) + print(f"lvl_to_dim: {casted.lvl_to_dim}") + # CHECK: pos_width: 16 + print(f"pos_width: {casted.pos_width}") + # CHECK: crd_width: 32 + print(f"crd_width: {casted.crd_width}") + + created = st.EncodingAttr.get( + casted.lvl_types, casted.dim_to_lvl, casted.lvl_to_dim, 
0, 0 + ) + # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]) }> + print(created) + # CHECK: created_equal: False + print(f"created_equal: {created == casted}") + + built_2_4 = st.EncodingAttr.build_level_type(st.LevelType.n_out_of_m, 2, 4) + dim_to_lvl = AffineMap.get( + 2, + 0, + [ + AffineExpr.get_dim(0), + AffineExpr.get_floor_div(AffineExpr.get_dim(1), 4), + AffineExpr.get_mod(AffineExpr.get_dim(1), 4), + ], + ) + lvl_to_dim = AffineMap.get( + 3, + 0, + [ + AffineExpr.get_dim(0), + AffineExpr.get_add( + AffineExpr.get_mul(AffineExpr.get_dim(1), 4), + AffineExpr.get_dim(2), + ), + ], + ) + built = st.EncodingAttr.get( + [st.LevelType.dense, st.LevelType.dense, built_2_4], + dim_to_lvl, + lvl_to_dim, + 0, + 0, + ) + # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]) }> + print(built) + # CHECK: built_equal: True + print(f"built_equal: {built == created}") + + # Verify that the factory creates an instance of the proper type. + # CHECK: is_proper_instance: True + print(f"is_proper_instance: {isinstance(created, st.EncodingAttr)}") + # CHECK: created_pos_width: 0 + print(f"created_pos_width: {created.pos_width}") + + # CHECK-LABEL: TEST: testEncodingAttr2D @run def testEncodingAttr2D(): -- cgit v1.1 From 07dc85ba0cc84e7034ad2a0575c644cfeab60b39 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 13:39:03 -0600 Subject: [NVVMReflect] Improve folding inside of the NVVMReflect pass (#81253) Summary: The previous patch did very simple folding that only worked for directly used branches. This patch improves this by traversing the use-def chain to simplify every constant subexpression until it reaches a terminator we can delete. The support should work for all expected cases now. --- llvm/docs/NVPTXUsage.rst | 3 +- llvm/lib/Target/NVPTX/NVVMReflect.cpp | 70 ++++++---------------- llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 78 ++++++++++++++++++++----- 3 files changed, 82 insertions(+), 69 deletions(-) diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index b5e3918..6a55b12 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -298,8 +298,7 @@ input IR module ``module.bc``, the following compilation flow is recommended: The ``NVVMReflect`` pass will attempt to remove dead code even without optimizations. This allows potentially incompatible instructions to be avoided -at all optimizations levels. This currently only works for simple conditionals -like the above example. +at all optimization levels by using the ``__CUDA_ARCH`` argument. 1. Save list of external functions in ``module.bc`` 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 3794ad9b..64fedf3 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -90,7 +90,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } SmallVector ToRemove; - SmallVector ToSimplify; + SmallVector ToSimplify; // Go through the calls in this function. Each call to __nvvm_reflect or // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. @@ -177,9 +177,8 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } // If the immediate user is a simple comparison we want to simplify it. - // TODO: This currently does not handle switch instructions. 
for (User *U : Call->users()) - if (ICmpInst *I = dyn_cast(U)) + if (Instruction *I = dyn_cast(U)) ToSimplify.push_back(I); Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); @@ -190,56 +189,21 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { I->eraseFromParent(); // The code guarded by __nvvm_reflect may be invalid for the target machine. - // We need to do some basic dead code elimination to trim invalid code before - // it reaches the backend at all optimization levels. - SmallVector Simplified; - for (ICmpInst *Cmp : ToSimplify) { - Constant *LHS = dyn_cast(Cmp->getOperand(0)); - Constant *RHS = dyn_cast(Cmp->getOperand(1)); - - if (!LHS || !RHS) - continue; - - // If the comparison is a compile time constant we simply propagate it. - Constant *C = ConstantFoldCompareInstOperands( - Cmp->getPredicate(), LHS, RHS, Cmp->getModule()->getDataLayout()); - - if (!C) - continue; - - for (User *U : Cmp->users()) - if (BranchInst *I = dyn_cast(U)) - Simplified.push_back(I); - - Cmp->replaceAllUsesWith(C); - Cmp->eraseFromParent(); - } - - // Each instruction here is a conditional branch off of a constant true or - // false value. Simply replace it with an unconditional branch to the - // appropriate basic block and delete the rest if it is trivially dead. - DenseSet Removed; - for (BranchInst *Branch : Simplified) { - if (Removed.contains(Branch)) - continue; - - ConstantInt *C = dyn_cast(Branch->getCondition()); - if (!C || (!C->isOne() && !C->isZero())) - continue; - - BasicBlock *TrueBB = - C->isOne() ? Branch->getSuccessor(0) : Branch->getSuccessor(1); - BasicBlock *FalseBB = - C->isOne() ? Branch->getSuccessor(1) : Branch->getSuccessor(0); - - // This transformation is only correct on simple edges. - if (!FalseBB->hasNPredecessors(1)) - continue; - - ReplaceInstWithInst(Branch, BranchInst::Create(TrueBB)); - if (FalseBB->use_empty() && !FalseBB->getFirstNonPHIOrDbg()) { - Removed.insert(FalseBB->getFirstNonPHIOrDbg()); - changeToUnreachable(FalseBB->getFirstNonPHIOrDbg()); + // Traverse the use-def chain, continually simplifying constant expressions + // until we find a terminator that we can then remove. 
+ while (!ToSimplify.empty()) { + Instruction *I = ToSimplify.pop_back_val(); + if (Constant *C = + ConstantFoldInstruction(I, F.getParent()->getDataLayout())) { + for (User *U : I->users()) + if (Instruction *I = dyn_cast(U)) + ToSimplify.push_back(I); + + I->replaceAllUsesWith(C); + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + } else if (I->isTerminator()) { + ConstantFoldTerminator(I->getParent()); } } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll index 9dcdf5b..0088d6c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -102,23 +102,24 @@ if.end: ret void } -; SM_52: .visible .func (.param .b32 func_retval0) qux() -; SM_52: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_52: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_52: ret; -; SM_70: .visible .func (.param .b32 func_retval0) qux() -; SM_70: mov.u32 %[[REG1:.+]], %[[REG2:.+]]; -; SM_70: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_70: ret; -; SM_90: .visible .func (.param .b32 func_retval0) qux() -; SM_90: st.param.b32 [func_retval0+0], %[[REG1:.+]]; -; SM_90: ret; +; SM_52: .visible .func (.param .b32 func_retval0) qux() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) qux() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) qux() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; define i32 @qux() { entry: %call = call i32 @__nvvm_reflect(ptr noundef @.str) - %cmp = icmp uge i32 %call, 700 - %conv = zext i1 %cmp to i32 - switch i32 %conv, label %sw.default [ + switch i32 %call, label %sw.default [ i32 900, label %sw.bb i32 700, label %sw.bb1 i32 520, label %sw.bb2 @@ -173,3 +174,52 @@ if.exit: exit: ret float 0.000000e+00 } + +; SM_52: .visible .func (.param .b32 func_retval0) prop() +; SM_52: mov.b32 %[[REG:.+]], 3; +; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_52-NEXT: ret; +; +; SM_70: .visible .func (.param .b32 func_retval0) prop() +; SM_70: mov.b32 %[[REG:.+]], 2; +; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_70-NEXT: ret; +; +; SM_90: .visible .func (.param .b32 func_retval0) prop() +; SM_90: mov.b32 %[[REG:.+]], 1; +; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]]; +; SM_90-NEXT: ret; +define i32 @prop() { +entry: + %call = call i32 @__nvvm_reflect(ptr @.str) + %conv = zext i32 %call to i64 + %div = udiv i64 %conv, 100 + %cmp = icmp eq i64 %div, 9 + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %return + +if.else: + %div2 = udiv i64 %conv, 100 + %cmp3 = icmp eq i64 %div2, 7 + br i1 %cmp3, label %if.then5, label %if.else6 + +if.then5: + br label %return + +if.else6: + %div7 = udiv i64 %conv, 100 + %cmp8 = icmp eq i64 %div7, 5 + br i1 %cmp8, label %if.then10, label %if.else11 + +if.then10: + br label %return + +if.else11: + br label %return + +return: + %retval = phi i32 [ 1, %if.then ], [ 2, %if.then5 ], [ 3, %if.then10 ], [ 4, %if.else11 ] + ret i32 %retval +} -- cgit v1.1 From 9d9cc3706f59499f443ce4ebaeb24f7c8417e797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Fri, 9 Feb 2024 20:40:16 +0100 Subject: [clang-format][docs] Fix version (#81185) #78752 was not merged in time for 
clang-format 18. --- clang/docs/ClangFormatStyleOptions.rst | 2 +- clang/include/clang/Tooling/Inclusions/IncludeStyle.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 0a8cc18..4ccdd2d 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -4156,7 +4156,7 @@ the configuration (without a prefix: ``Auto``). .. _MainIncludeChar: -**MainIncludeChar** (``MainIncludeCharDiscriminator``) :versionbadge:`clang-format 18` :ref:`¶ ` +**MainIncludeChar** (``MainIncludeCharDiscriminator``) :versionbadge:`clang-format 19` :ref:`¶ ` When guessing whether a #include is the "main" include, only the include directives that use the specified character are considered. diff --git a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h index c91e4a6..d167b7e 100644 --- a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h +++ b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h @@ -164,7 +164,7 @@ struct IncludeStyle { /// When guessing whether a #include is the "main" include, only the include /// directives that use the specified character are considered. - /// \version 18 + /// \version 19 MainIncludeCharDiscriminator MainIncludeChar; }; -- cgit v1.1 From 7ad7db0d9960859de10d23fa84aa581c154d152c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Feb 2024 11:45:54 -0800 Subject: [RISCV] Fix typo in ABI name in test. NFC ilp64->lp64. --- llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll b/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll index ca6895d..5edf3cf 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll @@ -1,8 +1,8 @@ ; RUN: llc -mattr=+zcmp -verify-machineinstrs \ -; RUN: -mtriple=riscv32 -target-abi ilp32 < %s \ +; RUN: -mtriple=riscv32 -target-abi=ilp32 < %s \ ; RUN: | FileCheck %s -check-prefixes=RV32IZCMP ; RUN: llc -mattr=+zcmp -verify-machineinstrs \ -; RUN: -mtriple=riscv64 -target-abi ilp64 < %s \ +; RUN: -mtriple=riscv64 -target-abi=lp64 < %s \ ; RUN: | FileCheck %s -check-prefixes=RV64IZCMP ; This source code exposed a crash in the RISC-V Zcmp Push/Pop optimization -- cgit v1.1 From 81c4bf946a377b1dc90c02ff3ff8240e78db0edb Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 12:01:49 -0800 Subject: [ELF] Improve _etext/_edata tests --- lld/test/ELF/riscv-section-layout.s | 25 +++++++++-------- lld/test/ELF/x86-64-section-layout.s | 54 +++++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/lld/test/ELF/riscv-section-layout.s b/lld/test/ELF/riscv-section-layout.s index 56ac95d..10e0feb 100644 --- a/lld/test/ELF/riscv-section-layout.s +++ b/lld/test/ELF/riscv-section-layout.s @@ -3,20 +3,20 @@ # RUN: llvm-mc -filetype=obj -triple=riscv32 %s -o %t.32.o # RUN: ld.lld -pie %t.32.o -o %t.32 -# RUN: llvm-readelf -S -s %t.32 | FileCheck %s --check-prefix=NOSDATA +# RUN: llvm-readelf -S -sX %t.32 | FileCheck %s --check-prefix=NOSDATA # RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym=SDATA=1 %s -o %t.32s.o # RUN: ld.lld -pie %t.32s.o -o %t.32s -# RUN: llvm-readelf -S -s %t.32s | FileCheck %s +# RUN: llvm-readelf -S -sX %t.32s | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.64.o # RUN: ld.lld -pie %t.64.o -o %t.64 -# RUN: llvm-readelf -S -s 
%t.64 | FileCheck %s --check-prefix=NOSDATA +# RUN: llvm-readelf -S -sX %t.64 | FileCheck %s --check-prefix=NOSDATA # RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym=SDATA=1 %s -o %t.64s.o # RUN: ld.lld -pie %t.64s.o -o %t.64s -# RUN: llvm-readelf -S -s %t.64s | FileCheck %s +# RUN: llvm-readelf -S -sX %t.64s | FileCheck %s # NOSDATA: .text -# NOSDATA-NEXT: .tdata +# NOSDATA-NEXT: .tdata PROGBITS [[#%x,TDATA:]] # NOSDATA-NEXT: .tbss # NOSDATA-NEXT: .dynamic # NOSDATA-NEXT: .got @@ -28,9 +28,10 @@ ## exist, define __global_pointer$ and set its st_shndx arbitrarily to 1. ## The symbol value should not be used by the program. -# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] _edata -# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] __bss_start -# NOSDATA-DAG: [[#]]: {{0*}}800 0 NOTYPE GLOBAL DEFAULT 1 __global_pointer$ +# NOSDATA-DAG: [[#]]: {{.*}} 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext +# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.data) _edata +# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.bss) __bss_start +# NOSDATA-DAG: [[#]]: {{0*}}800 0 NOTYPE GLOBAL DEFAULT 1 (.dynsym) __global_pointer$ # CHECK: .text # CHECK-NEXT: .tdata @@ -43,11 +44,11 @@ # CHECK-NEXT: .sbss NOBITS [[#%x,SBSS:]] # CHECK-NEXT: .bss -# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] _edata -# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] __bss_start -# CHECK-DAG: [[#]]: {{0*}}[[#SDATA+0x800]] 0 NOTYPE GLOBAL DEFAULT [[#]] __global_pointer$ +# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sdata) _edata +# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sbss) __bss_start +# CHECK-DAG: [[#]]: {{0*}}[[#SDATA+0x800]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sdata) __global_pointer$ -.globl _edata, __bss_start +.globl _etext, _edata, __bss_start lla gp, __global_pointer$ .section .data,"aw",@progbits; .long _GLOBAL_OFFSET_TABLE_ - . 
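For context on the symbols these tests exercise: `_etext`, `_edata`, `__bss_start`, and `_end` are the traditional linker-defined layout symbols, marking (respectively) the first address past the text segment, the first address past initialized data, the start of .bss, and the first address past the whole image. The following is a minimal, hypothetical C consumer, not part of this patch; only the symbol names come from the tests, and the printed addresses are illustrative:

/* Hypothetical consumer of the linker-defined layout symbols checked in
 * the tests above. Declaring them as arrays avoids accidental loads;
 * only their addresses are meaningful. */
#include <stdio.h>

extern char _etext[];      /* first address past the text segment     */
extern char _edata[];      /* first address past the initialized data */
extern char __bss_start[]; /* start of zero-initialized data (.bss)   */
extern char _end[];        /* first address past the .bss section     */

int main(void) {
  printf("_etext      = %p\n", (void *)_etext);
  printf("_edata      = %p\n", (void *)_edata);
  printf("__bss_start = %p\n", (void *)__bss_start);
  printf("_end        = %p\n", (void *)_end);
  return 0;
}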
diff --git a/lld/test/ELF/x86-64-section-layout.s b/lld/test/ELF/x86-64-section-layout.s index 3720127..f292877 100644 --- a/lld/test/ELF/x86-64-section-layout.s +++ b/lld/test/ELF/x86-64-section-layout.s @@ -6,31 +6,31 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64 --defsym=BSS=1 a.s -o a.o # RUN: ld.lld --section-start=.note=0x200300 a.o -o a -# RUN: llvm-readelf -S -l a | FileCheck %s +# RUN: llvm-readelf -S -l -sX a | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a1.o # RUN: ld.lld --section-start=.note=0x200300 a1.o -o a1 -# RUN: llvm-readelf -S a1 | FileCheck %s --check-prefix=CHECK1 +# RUN: llvm-readelf -S -sX a1 | FileCheck %s --check-prefix=CHECK1 # RUN: ld.lld -T b.lds -z norelro a.o -o b # RUN: llvm-readelf -S -l b | FileCheck %s --check-prefix=CHECK2 -# CHECK: Name Type Address Off Size ES Flg Lk Inf Al -# CHECK-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0 -# CHECK-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 -# CHECK-NEXT: .lrodata PROGBITS 0000000000200301 000301 000002 00 Al 0 0 1 -# CHECK-NEXT: .rodata PROGBITS 0000000000200303 000303 000001 00 A 0 0 1 -# CHECK-NEXT: .text PROGBITS 0000000000201304 000304 000001 00 AX 0 0 4 -# CHECK-NEXT: .tdata PROGBITS 0000000000202305 000305 000001 00 WAT 0 0 1 -# CHECK-NEXT: .tbss NOBITS 0000000000202306 000306 000002 00 WAT 0 0 1 -# CHECK-NEXT: .relro_padding NOBITS 0000000000202306 000306 000cfa 00 WA 0 0 1 -# CHECK-NEXT: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1 -# CHECK-NEXT: .bss NOBITS 0000000000203307 000307 001800 00 WA 0 0 1 +# CHECK: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 +# CHECK-NEXT: .lrodata PROGBITS 0000000000200301 000301 000002 00 Al 0 0 1 +# CHECK-NEXT: .rodata PROGBITS 0000000000200303 000303 000001 00 A 0 0 1 +# CHECK-NEXT: .text PROGBITS 0000000000201304 000304 000001 00 AX 0 0 4 +# CHECK-NEXT: .tdata PROGBITS 0000000000202305 000305 000001 00 WAT 0 0 1 +# CHECK-NEXT: .tbss NOBITS 0000000000202306 000306 000002 00 WAT 0 0 1 +# CHECK-NEXT: .relro_padding NOBITS 0000000000202306 000306 000cfa 00 WA 0 0 1 +# CHECK-NEXT: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1 +# CHECK-NEXT: .bss NOBITS 0000000000203307 000307 001800 00 WA 0 0 1 ## We spend size(.bss) % MAXPAGESIZE bytes for .bss. 
-# CHECK-NEXT: .ldata PROGBITS 0000000000205b07 000b07 000002 00 WAl 0 0 1 -# CHECK-NEXT: .ldata2 PROGBITS 0000000000205b09 000b09 000001 00 WAl 0 0 1 -# CHECK-NEXT: .lbss NOBITS 0000000000205b0a 000b0a 000002 00 WAl 0 0 1 -# CHECK-NEXT: .comment PROGBITS 0000000000000000 000b0a {{.*}} 01 MS 0 0 1 +# CHECK-NEXT: .ldata PROGBITS 0000000000205b07 000b07 000002 00 WAl 0 0 1 +# CHECK-NEXT: .ldata2 PROGBITS 0000000000205b09 000b09 000001 00 WAl 0 0 1 +# CHECK-NEXT: .lbss NOBITS 0000000000205b0a 000b0a 001201 00 WAl 0 0 1 +# CHECK-NEXT: .comment PROGBITS 0000000000000000 000b0a {{.*}} 01 MS 0 0 1 # CHECK: Program Headers: # CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align @@ -39,13 +39,23 @@ # CHECK-NEXT: LOAD 0x000304 0x0000000000201304 0x0000000000201304 0x000001 0x000001 R E 0x1000 # CHECK-NEXT: LOAD 0x000305 0x0000000000202305 0x0000000000202305 0x000001 0x000cfb RW 0x1000 # CHECK-NEXT: LOAD 0x000306 0x0000000000203306 0x0000000000203306 0x000001 0x001801 RW 0x1000 -# CHECK-NEXT: LOAD 0x000b07 0x0000000000205b07 0x0000000000205b07 0x000003 0x000005 RW 0x1000 +# CHECK-NEXT: LOAD 0x000b07 0x0000000000205b07 0x0000000000205b07 0x000003 0x001204 RW 0x1000 + +# CHECK: 0000000000201304 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _start +# CHECK-NEXT: 0000000000201305 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext +# CHECK-NEXT: 0000000000205b0a 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _edata +# CHECK-NEXT: 0000000000206d0b 0 NOTYPE GLOBAL DEFAULT [[#]] (.lbss) _end # CHECK1: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1 # CHECK1-NEXT: .ldata PROGBITS 0000000000203307 000307 000002 00 WAl 0 0 1 # CHECK1-NEXT: .ldata2 PROGBITS 0000000000203309 000309 000001 00 WAl 0 0 1 # CHECK1-NEXT: .comment PROGBITS 0000000000000000 00030a {{.*}} 01 MS 0 0 1 +# CHECK1: 0000000000201304 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _start +# CHECK1-NEXT: 0000000000201305 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext +# CHECK1-NEXT: 000000000020330a 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _edata +# CHECK1-NEXT: 000000000020330a 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _end + # CHECK2: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 # CHECK2-NEXT: .lrodata PROGBITS 0000000000200301 000301 000001 00 Al 0 0 1 ## With a SECTIONS command, we suppress the default rule placing .lrodata.* into .lrodata. 
@@ -59,7 +69,7 @@ # CHECK2-NEXT: .bss NOBITS 0000000000200307 000307 001800 00 WA 0 0 1 # CHECK2-NEXT: .ldata PROGBITS 0000000000201b07 001b07 000002 00 WAl 0 0 1 # CHECK2-NEXT: .ldata2 PROGBITS 0000000000201b09 001b09 000001 00 WAl 0 0 1 -# CHECK2-NEXT: .lbss NOBITS 0000000000201b0a 001b0a 000002 00 WAl 0 0 1 +# CHECK2-NEXT: .lbss NOBITS 0000000000201b0a 001b0a 001201 00 WAl 0 0 1 # CHECK2-NEXT: .comment PROGBITS 0000000000000000 001b0a {{.*}} 01 MS 0 0 1 # CHECK2: Program Headers: @@ -67,11 +77,11 @@ # CHECK2-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 {{.*}} {{.*}} R 0x8 # CHECK2-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x000304 0x000304 R 0x1000 # CHECK2-NEXT: LOAD 0x000304 0x0000000000200304 0x0000000000200304 0x000001 0x000001 R E 0x1000 -# CHECK2-NEXT: LOAD 0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x001807 RW 0x1000 +# CHECK2-NEXT: LOAD 0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x002a06 RW 0x1000 # CHECK2-NEXT: TLS 0x000305 0x0000000000200305 0x0000000000200305 0x000001 0x000003 R 0x1 #--- a.s -.globl _start +.globl _start, _etext, _edata, _end _start: ret @@ -92,7 +102,7 @@ _start: ## Input .ldata.rel.ro sections are placed in the output .ldata section. .section .ldata.rel.ro,"awl"; .space 1 .ifdef BSS -.section .lbss,"awl",@nobits; .space 1 +.section .lbss,"awl",@nobits; .space 0x1200 ## Input .lbss.rel.ro sections are placed in the output .lbss section. .section .lbss.rel.ro,"awl",@nobits; .space 1 .endif -- cgit v1.1 From bb180856ec28efe305dc77ca4bb3db12d8932edf Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 14:04:59 -0600 Subject: [NVPTX][Fix] Update minimum CPU for NVPTX intrinsics test Summary: This test requires at least sm_30 to run, but that is still below the minimum supported version of sm_52 currently. Just set this to sm_60 so the tests pass in the future. --- llvm/test/CodeGen/NVPTX/intrinsics.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll index 7e45b1f..2994f60 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll @@ -1,7 +1,7 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify %} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify %} ; CHECK-LABEL: test_fabsf( define float @test_fabsf(float %f) { -- cgit v1.1 From 5f26b902d59b98ffa450f7bae508b330d0184d0d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 12:09:42 -0800 Subject: [ELF] Apply forgotten change to #81223 --- lld/ELF/Writer.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 53ca70b..bd4db1e 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2446,8 +2446,8 @@ SmallVector Writer::createPhdrs(Partition &part) { // Segments are contiguous memory regions that has the same attributes // (e.g. executable or writable). There is one phdr for each segment. 
// Therefore, we need to create a new phdr when the next section has - // compatible flags or is loaded at a discontiguous address or memory region - // using AT or AT> linker script command, respectively. + // incompatible flags or is loaded at a discontiguous address or memory + // region using AT or AT> linker script command, respectively. // // As an exception, we don't create a separate load segment for the ELF // headers, even if the first "real" output has an AT or AT> attribute. @@ -2461,10 +2461,10 @@ SmallVector Writer::createPhdrs(Partition &part) { // needed to create a new LOAD) uint64_t newFlags = computeFlags(sec->getPhdrFlags()); // When --no-rosegment is specified, RO and RX sections are compatible. - uint32_t diff = flags ^ newFlags; + uint32_t incompatible = flags ^ newFlags; if (config->singleRoRx && !(newFlags & PF_W)) - diff &= ~PF_X; - if (diff) + incompatible &= ~PF_X; + if (incompatible) load = nullptr; bool sameLMARegion = -- cgit v1.1 From 3c707310a3e0233c1bc364a408e6fb43e56e1b78 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 14:11:01 -0600 Subject: [NVPTX] Add clang builtin for `__nvvm_reflect` intrinsic (#81277) Summary: Some recent support made usage of `__nvvm_reflect` more consistent. We should expose it as a builtin rather than forcing users to externally define the function. --- clang/include/clang/Basic/BuiltinsNVPTX.def | 1 + clang/test/CodeGen/builtins-nvptx.c | 8 ++++++++ clang/test/CodeGenOpenCL/reflect.cl | 28 ++++++++++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 3 ++- llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll | 4 ++-- llvm/test/CodeGen/NVPTX/nvvm-reflect.ll | 4 ++-- 6 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/reflect.cl diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 7819e71..8d3c5e6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -159,6 +159,7 @@ BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n") BUILTIN(__nvvm_prmt, "UiUiUiUi", "") BUILTIN(__nvvm_exit, "v", "r") +BUILTIN(__nvvm_reflect, "UicC*", "r") TARGET_BUILTIN(__nvvm_nanosleep, "vUi", "n", AND(SM_70, PTX63)) // Min Max diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index ad7c27f..4dba767 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -44,6 +44,14 @@ __device__ int read_tid() { } +__device__ bool reflect() { + +// CHECK: call i32 @llvm.nvvm.reflect(ptr {{.*}}) + + unsigned x = __nvvm_reflect("__CUDA_ARCH"); + return x >= 700; +} + __device__ int read_ntid() { // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl new file mode 100644 index 0000000..9ae4a5f --- /dev/null +++ b/clang/test/CodeGenOpenCL/reflect.cl @@ -0,0 +1,28 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -emit-llvm -O0 -o - | FileCheck %s + +// CHECK-LABEL: define dso_local zeroext i1 @device_function( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.reflect(ptr addrspacecast (ptr addrspace(4) @.str to ptr)) +// CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[TMP0]], 700 +// CHECK-NEXT: ret i1 [[CMP]] +// +bool device_function() { + return __nvvm_reflect("__CUDA_ARCH") >= 700; +} + +// 
CHECK-LABEL: define dso_local spir_kernel void @kernel_function( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4 +// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @device_function() #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CALL]] to i32 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4 +// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +__kernel void kernel_function(__global int *i) { + *i = device_function(); +} diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index d825dc8..726cea0 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1625,7 +1625,8 @@ def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">; def int_nvvm_reflect : - Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.reflect">; + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem], "llvm.nvvm.reflect">, + ClangBuiltin<"__nvvm_reflect">; // isspacep.{const, global, local, shared} def int_nvvm_isspacep_const diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll index 1cb5c87..46ab79d 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll @@ -41,7 +41,7 @@ exit: ret float %ret } -declare i32 @llvm.nvvm.reflect.p0(ptr) +declare i32 @llvm.nvvm.reflect(ptr) ; CHECK-LABEL: define noundef i32 @intrinsic define i32 @intrinsic() { @@ -49,7 +49,7 @@ define i32 @intrinsic() { ; USE_FTZ_0: ret i32 0 ; USE_FTZ_1: ret i32 1 %ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str) - %reflect = tail call i32 @llvm.nvvm.reflect.p0(ptr %ptr) + %reflect = tail call i32 @llvm.nvvm.reflect(ptr %ptr) ret i32 %reflect } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll index 9b1939f..2ed9f7c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll @@ -41,7 +41,7 @@ exit: ret float %ret } -declare i32 @llvm.nvvm.reflect.p0(ptr) +declare i32 @llvm.nvvm.reflect(ptr) ; CHECK-LABEL: define noundef i32 @intrinsic define i32 @intrinsic() { @@ -49,7 +49,7 @@ define i32 @intrinsic() { ; USE_FTZ_0: ret i32 0 ; USE_FTZ_1: ret i32 1 %ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str) - %reflect = tail call i32 @llvm.nvvm.reflect.p0(ptr %ptr) + %reflect = tail call i32 @llvm.nvvm.reflect(ptr %ptr) ret i32 %reflect } -- cgit v1.1 From 2cbe5a33a5fda257747d75863bd9ccb8920b9249 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 9 Feb 2024 12:26:12 -0800 Subject: [llvm-objcopy] Fix the build again after 7ddc320 --- llvm/lib/ObjCopy/ELF/ELFObject.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp index d7559ab..b9b9167 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp @@ -2980,7 +2980,7 @@ SRECWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const { SRECSizeCalculator SizeCalc(EmptyBuffer, 0); for (const SectionBase *Sec : Sections) if 
(Error Err = Sec->accept(SizeCalc)) - return Err; + return std::move(Err); SizeCalc.writeRecords(Obj.Entry); // We need to add the size of the Header and Terminator records. -- cgit v1.1 From 228e9d5bcfcb6411d2a257b560464323d0248c35 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 9 Feb 2024 15:31:08 -0500 Subject: [clang] Document the type_visibility attribute (#79157) I was looking for the documentation of that attribute, and the best I could find was a Stackoverflow answer or the commit message that originally introduced the attribute. I figured I might as well document what I find to save everyone time in the future. --- clang/include/clang/Basic/Attr.td | 4 ++-- clang/include/clang/Basic/AttrDocs.td | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index b2d5309..45a29e7 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3226,8 +3226,8 @@ def TypeVisibility : InheritableAttr { let Args = [EnumArgument<"Visibility", "VisibilityType", ["default", "hidden", "internal", "protected"], ["Default", "Hidden", "Hidden", "Protected"]>]; -// let Subjects = [Tag, ObjCInterface, Namespace]; - let Documentation = [Undocumented]; + // let Subjects = SubjectList<[Tag, ObjCInterface, Namespace], ErrorDiag>; + let Documentation = [TypeVisibilityDocs]; } def VecReturn : InheritableAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 19a98a0..8d36909 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -5585,6 +5585,25 @@ See :doc:`LTOVisibility`. }]; } +def TypeVisibilityDocs : Documentation { + let Category = DocCatType; + let Content = [{ +The ``type_visibility`` attribute allows the visibility of a type and its vague +linkage objects (vtable, typeinfo, typeinfo name) to be controlled separately from +the visibility of functions and data members of the type. + +For example, this can be used to give default visibility to the typeinfo and the vtable +of a type while still keeping hidden visibility on its member functions and static data +members. + +This attribute can only be applied to types and namespaces. + +If both ``visibility`` and ``type_visibility`` are applied to a type or a namespace, the +visibility specified with the ``type_visibility`` attribute overrides the visibility +provided with the regular ``visibility`` attribute. + }]; +} + def RenderScriptKernelAttributeDocs : Documentation { let Category = DocCatFunction; let Content = [{ -- cgit v1.1 From bd65547805a4b02be8f8c9e7acf1df850164da53 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 9 Feb 2024 13:04:49 -0800 Subject: [workflows] Create a more descriptive title and body when creating a PR for backports (#80396) When a backport request is made, the resulting pull request will have a title like this: : And a body that says: Backport .. 
Requested By: --- .github/workflows/issue-release-workflow.yml | 1 + llvm/utils/git/github-automation.py | 29 ++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml index 33a1e89..448c1c5 100644 --- a/.github/workflows/issue-release-workflow.yml +++ b/.github/workflows/issue-release-workflow.yml @@ -65,4 +65,5 @@ jobs: release-workflow \ --branch-repo-token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ --issue-number ${{ github.event.issue.number }} \ + --requested-by ${{ github.event.issue.user.login }} \ auto diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py index e2b84ae..b475eff 100755 --- a/llvm/utils/git/github-automation.py +++ b/llvm/utils/git/github-automation.py @@ -343,6 +343,7 @@ class ReleaseWorkflow: branch_repo_name: str, branch_repo_token: str, llvm_project_dir: str, + requested_by: str, ) -> None: self._token = token self._repo_name = repo @@ -353,6 +354,7 @@ class ReleaseWorkflow: else: self._branch_repo_token = self.token self._llvm_project_dir = llvm_project_dir + self._requested_by = requested_by @property def token(self) -> str: @@ -383,6 +385,10 @@ class ReleaseWorkflow: return self._llvm_project_dir @property + def requested_by(self) -> str: + return self._requested_by + + @property def repo(self) -> github.Repository.Repository: return github.Github(self.token).get_repo(self.repo_name) @@ -536,7 +542,7 @@ class ReleaseWorkflow: self.issue_remove_cherry_pick_failed_label() return self.create_pull_request( - self.branch_repo_owner, self.repo_name, branch_name + self.branch_repo_owner, self.repo_name, branch_name, commits ) def check_if_pull_request_exists( @@ -545,7 +551,9 @@ class ReleaseWorkflow: pulls = repo.get_pulls(head=head) return pulls.totalCount != 0 - def create_pull_request(self, owner: str, repo_name: str, branch: str) -> bool: + def create_pull_request( + self, owner: str, repo_name: str, branch: str, commits: List[str] + ) -> bool: """ Create a pull request in `self.repo_name`. 
The base branch of the pull request will be chosen based on the the milestone attached to @@ -567,9 +575,15 @@ class ReleaseWorkflow: print("PR already exists...") return True try: + commit_message = repo.get_commit(commits[-1]).commit.message + message_lines = commit_message.splitlines() + title = "{}: {}".format(release_branch_for_issue, message_lines[0]) + body = "Backport {}\n\nRequested by: @{}".format( + " ".join(commits), self.requested_by + ) pull = repo.create_pull( - title=f"PR for {issue_ref}", - body="resolves {}".format(issue_ref), + title=title, + body=body, base=release_branch_for_issue, head=head, maintainer_can_modify=False, @@ -683,6 +697,12 @@ llvmbot_git_config_parser = subparsers.add_parser( "setup-llvmbot-git", help="Set the default user and email for the git repo in LLVM_PROJECT_DIR to llvmbot", ) +release_workflow_parser.add_argument( + "--requested-by", + type=str, + required=True, + help="The user that requested this backport", +) args = parser.parse_args() @@ -712,6 +732,7 @@ elif args.command == "release-workflow": args.branch_repo, args.branch_repo_token, args.llvm_project_dir, + args.requested_by, ) if not release_workflow.release_branch_for_issue: release_workflow.issue_notify_no_milestone(sys.stdin.readlines()) -- cgit v1.1 From 967374123bd6eee23db9a57fcac7324e420648c5 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 9 Feb 2024 15:05:18 -0600 Subject: [libc] Bump up minimum PTX version to 6.3 Summary: I neglected the fact that `activemask` is a 6.2 or 6.3 feature, so building this on older machines is incorrect. Bump this up to 6.3 for now so it works. In the future we will try to get rid of the N architecture business. --- libc/cmake/modules/LLVMLibCObjectRules.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 70e64a6..ef1f248 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -106,13 +106,13 @@ function(get_nvptx_compile_options output_var gpu_arch) list(APPEND nvptx_options "-Wno-unknown-cuda-version") list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false") if(${gpu_arch} STREQUAL "sm_35") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_37") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_50") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_52") - list(APPEND nvptx_options "--cuda-feature=+ptx60") + list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_53") list(APPEND nvptx_options "--cuda-feature=+ptx63") elseif(${gpu_arch} STREQUAL "sm_60") -- cgit v1.1 From 7fd1466433a05f4e2e183914a8bd7c372bb0b8a7 Mon Sep 17 00:00:00 2001 From: Richard Dzenis Date: Fri, 9 Feb 2024 23:07:07 +0200 Subject: [mlir] Fix CallOpInterface extraClassDeclaration to be fully namespace qualified (#81258) `extraClassDeclaration` of `CallOpInterface` can be inherited by other `OpInterfaces` into foreign namespaces, thus types must be fully qualified to prevent compiler errors, for example: def MyCaller : OpInterface<"MyCaller", [CallOpInterface]> { let cppNamespace = "::MyNamespace"; } --- mlir/include/mlir/Interfaces/CallInterfaces.td | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/mlir/include/mlir/Interfaces/CallInterfaces.td b/mlir/include/mlir/Interfaces/CallInterfaces.td index 3e9c002..752de74 100644 --- a/mlir/include/mlir/Interfaces/CallInterfaces.td +++ b/mlir/include/mlir/Interfaces/CallInterfaces.td @@ -68,7 +68,7 @@ def CallOpInterface : OpInterface<"CallOpInterface"> { /// `symbolTable` is an optional parameter that will allow for using a /// cached symbol table for symbol lookups instead of performing an O(N) /// scan. - Operation *resolveCallable(SymbolTableCollection *symbolTable = nullptr); + ::mlir::Operation *resolveCallable(::mlir::SymbolTableCollection *symbolTable = nullptr); }]; } -- cgit v1.1 From 1d0f86ba80543931d467d6ce3f2ad8cdde514710 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 13:39:08 -0800 Subject: [Sema] Warn unused functions for FMV based on the target attribute (#81302) The spurious -Wunused-function warning issue for `target_version` #80227 also applied to `__attribute__((target(...)))` based FMV. #81167 removed warnings for all `target`-based FMV. This patch restores the warnings for `__attribute__((target("default")))`. --- clang/lib/AST/Decl.cpp | 6 +++++- clang/test/SemaCXX/attr-target-mv-warn-unused.cpp | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaCXX/attr-target-mv-warn-unused.cpp diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index e281f2d..5d6bb72 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3538,7 +3538,11 @@ bool FunctionDecl::isTargetMultiVersion() const { } bool FunctionDecl::isTargetMultiVersionDefault() const { - return isMultiVersion() && hasAttr() && + if (!isMultiVersion()) + return false; + if (hasAttr()) + return getAttr()->isDefaultVersion(); + return hasAttr() && getAttr()->isDefaultVersion(); } diff --git a/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp b/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp new file mode 100644 index 0000000..1901589 --- /dev/null +++ b/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify -Wunused %s + +__attribute__((target("sse3"))) +static int not_used_fmv() { return 1; } +__attribute__((target("avx2"))) +static int not_used_fmv() { return 2; } +__attribute__((target("default"))) +static int not_used_fmv() { return 0; } // expected-warning {{unused function 'not_used_fmv'}} + +__attribute__((target("sse3"))) +static int definitely_used_fmv() { return 1; } +__attribute__((target("avx2"))) +static int definitely_used_fmv() { return 2; } +__attribute__((target("default"))) +static int definitely_used_fmv() { return 0; } +int definite_user() { return definitely_used_fmv(); } -- cgit v1.1 From c7a0db1e20251f436e3d500eac03bd9be1d88b45 Mon Sep 17 00:00:00 2001 From: yozhu <101743168+yozhu@users.noreply.github.com> Date: Fri, 9 Feb 2024 13:55:08 -0800 Subject: [CFI][annotation] Leave alone function pointers in function annotations (#80173) Function annotation, as part of llvm.metadata, is for the function itself and doesn't apply to its corresponding jump table entry, so with CFI we shouldn't replace function pointer in function annotation with pointer to its corresponding jump table entry. 
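As a source-level illustration of the case being fixed (hypothetical C, separate from the .ll test added below; the names `hot_path` and `perf_critical` are invented): Clang lowers a function-level `annotate` attribute into an entry of the `llvm.global.annotations` metadata global, and that entry refers to the annotated function itself, so under CFI it should keep pointing at the real body rather than at the jump table entry:

/* Hypothetical sketch: built with CFI enabled (e.g. -flto
 * -fsanitize=cfi-icall), the indirect call through 'fp' is checked via
 * the jump table, while the annotate attribute below lands in
 * llvm.global.annotations and must keep referencing 'hot_path' itself. */
__attribute__((annotate("perf_critical")))
int hot_path(int x) { return x + 1; }

int call_through(int (*fp)(int), int v) {
  return fp(v); /* CFI-checked indirect call */
}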
--- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 28 ++++++++- .../Transforms/LowerTypeTests/cfi-annotation.ll | 68 ++++++++++++++++++++++ 2 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 733f290..633fcb3 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -470,6 +470,9 @@ class LowerTypeTestsModule { Function *WeakInitializerFn = nullptr; + GlobalVariable *GlobalAnnotation; + DenseSet FunctionAnnotations; + bool shouldExportConstantsAsAbsoluteSymbols(); uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); @@ -531,6 +534,10 @@ class LowerTypeTestsModule { /// replace each use, which is a direct function call. void replaceDirectCalls(Value *Old, Value *New); + bool isFunctionAnnotation(Value *V) const { + return FunctionAnnotations.contains(V); + } + public: LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM, ModuleSummaryIndex *ExportSummary, @@ -1377,8 +1384,11 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( // (all?) targets. Switch to a runtime initializer. SmallSetVector GlobalVarUsers; findGlobalVariableUsersOf(F, GlobalVarUsers); - for (auto *GV : GlobalVarUsers) + for (auto *GV : GlobalVarUsers) { + if (GV == GlobalAnnotation) + continue; moveInitializerToModuleConstructor(GV); + } // Can not RAUW F with an expression that uses F. Replace with a temporary // placeholder first. @@ -1837,6 +1847,16 @@ LowerTypeTestsModule::LowerTypeTestsModule( } OS = TargetTriple.getOS(); ObjectFormat = TargetTriple.getObjectFormat(); + + // Function annotation describes or applies to function itself, and + // shouldn't be associated with jump table thunk generated for CFI. + GlobalAnnotation = M.getGlobalVariable("llvm.global.annotations"); + if (GlobalAnnotation && GlobalAnnotation->hasInitializer()) { + const ConstantArray *CA = + cast(GlobalAnnotation->getInitializer()); + for (Value *Op : CA->operands()) + FunctionAnnotations.insert(Op); + } } bool LowerTypeTestsModule::runForTesting(Module &M, ModuleAnalysisManager &AM) { @@ -1896,10 +1916,14 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, if (isa(U.getUser())) continue; - // Skip direct calls to externally defined or non-dso_local functions + // Skip direct calls to externally defined or non-dso_local functions. if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical)) continue; + // Skip function annotation. + if (isFunctionAnnotation(U.getUser())) + continue; + // Must handle Constants specially, we cannot call replaceUsesOfWith on a // constant because they are uniqued. 
    if (auto *C = dyn_cast<Constant>(U.getUser())) {

diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll
new file mode 100644
index 0000000..034af89
--- /dev/null
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll
@@ -0,0 +1,68 @@
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -passes=lowertypetests %s -o %t.o
+; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-foobar
+; CHECK-foobar: {{llvm.global.annotations = .*[foo|bar], .*[foo|bar],}}
+; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-cfi
+; CHECK-cfi-NOT: {{llvm.global.annotations = .*cfi.*}}
+
+target triple = "aarch64-none-linux-gnu"
+
+@.src = private unnamed_addr constant [7 x i8] c"test.c\00", align 1
+@.str = private unnamed_addr constant [30 x i8] c"annotation_string_literal_bar\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", section "llvm.metadata"
+@.str.2 = private unnamed_addr constant [30 x i8] c"annotation_string_literal_foo\00", section "llvm.metadata"
+@llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @bar, ptr @.str, ptr @.str.1, i32 2, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @foo, ptr @.str.2, ptr @.str.1, i32 1, ptr null }], section "llvm.metadata"
+
+define i32 @bar(i32 noundef %0) #0 !type !8 !type !9 {
+  %2 = alloca i32, align 4
+  store i32 %0, ptr %2, align 4
+  %3 = load i32, ptr %2, align 4
+  %4 = call i32 @foo(i32 noundef %3)
+  ret i32 %4
+}
+
+declare !type !8 !type !9 i32 @foo(i32 noundef) #1
+
+define i32 @test(i32 noundef %0) #0 !type !8 !type !9 {
+  %2 = alloca i32, align 4
+  %3 = alloca ptr, align 8
+  store i32 %0, ptr %2, align 4
+  %4 = load i32, ptr %2, align 4
+  %5 = icmp sgt i32 %4, 0
+  %6 = zext i1 %5 to i64
+  %7 = select i1 %5, ptr @foo, ptr @bar
+  store ptr %7, ptr %3, align 8
+  %8 = load ptr, ptr %3, align 8
+  %9 = call i1 @llvm.type.test(ptr %8, metadata !"_ZTSFiiE"), !nosanitize !10
+  br i1 %9, label %11, label %10, !nosanitize !10
+
+10:
+  call void @llvm.ubsantrap(i8 2) #4, !nosanitize !10
+  unreachable, !nosanitize !10
+
+11:
+  %12 = load i32, ptr %2, align 4
+  %13 = call i32 %8(i32 noundef %12)
+  ret i32 %13
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.ubsantrap(i8 immarg)
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" }
+attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" }
+attributes #4 = { noreturn nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 4, !"CFI Canonical Jump Tables", i32 0}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{i32 1, !"ThinLTO", i32 0}
+!6 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!8 = !{i64 0, !"_ZTSFiiE"}
+!9 = !{i64 0, !"_ZTSFiiE.generalized"}
+!10 = !{}
-- 
cgit v1.1


From 7ff488708c0caa1b31af7ad677b9b321209f6738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Fri, 9 Feb 2024 13:57:26 -0800
Subject: [flang][cuda][NFC] Rename CUDAAttribute to CUDADataAttribute (#81323)

The
newly introduced `CUDAAttribute` is meant for CUDA attributes associated with variable. In order to not clash with the future attribute for function/subroutine, rename `CUDAAttribute` to `CUDADataAttribute`. --- flang/include/flang/Lower/ConvertVariable.h | 6 +++--- flang/include/flang/Optimizer/Builder/FIRBuilder.h | 4 ++-- flang/include/flang/Optimizer/Builder/HLFIRTools.h | 2 +- flang/include/flang/Optimizer/Dialect/FIRAttr.td | 7 ++++--- flang/include/flang/Optimizer/Dialect/FIROps.td | 4 ++-- flang/include/flang/Optimizer/HLFIR/HLFIROps.td | 4 ++-- flang/include/flang/Optimizer/Support/Utils.h | 20 +++++++++---------- flang/lib/Lower/CallInterface.cpp | 2 +- flang/lib/Lower/ConvertVariable.cpp | 23 +++++++++++----------- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 4 ++-- flang/lib/Optimizer/Builder/HLFIRTools.cpp | 2 +- flang/lib/Optimizer/Dialect/FIRAttr.cpp | 2 +- flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp | 2 +- .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 4 ++-- flang/unittests/Optimizer/FortranVariableTest.cpp | 8 ++++---- 15 files changed, 48 insertions(+), 46 deletions(-) diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h index cdbf050..b13bb41 100644 --- a/flang/include/flang/Lower/ConvertVariable.h +++ b/flang/include/flang/Lower/ConvertVariable.h @@ -139,9 +139,9 @@ translateSymbolAttributes(mlir::MLIRContext *mlirContext, /// Translate the CUDA Fortran attributes of \p sym into the FIR CUDA attribute /// representation. -fir::CUDAAttributeAttr -translateSymbolCUDAAttribute(mlir::MLIRContext *mlirContext, - const Fortran::semantics::Symbol &sym); +fir::CUDADataAttributeAttr +translateSymbolCUDADataAttribute(mlir::MLIRContext *mlirContext, + const Fortran::semantics::Symbol &sym); /// Map a symbol to a given fir::ExtendedValue. This will generate an /// hlfir.declare when lowering to HLFIR and map the hlfir.declare result to the diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index f50dacd..39821f1 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -231,13 +231,13 @@ public: mlir::StringAttr linkage = {}, mlir::Attribute value = {}, bool isConst = false, bool isTarget = false, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); fir::GlobalOp createGlobal(mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, mlir::StringAttr linkage = {}, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); /// Create a global constant (read-only) value. fir::GlobalOp createGlobalConstant(mlir::Location loc, mlir::Type type, diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index fe69ffa..170e134 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -237,7 +237,7 @@ fir::FortranVariableOpInterface genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, fir::FortranVariableFlagsAttr flags, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); /// Generate an hlfir.associate to build a variable from an expression value. 
/// The type of the variable must be provided so that scalar logicals are diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index bc73124..422ad53 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -66,8 +66,8 @@ def CUDAshared : I32EnumAttrCase<"Shared", 4, "shared">; def CUDAunified : I32EnumAttrCase<"Unified", 5, "unified">; // Texture is omitted since it is obsolete and rejected by semantic. -def fir_CUDAAttribute : I32EnumAttr< - "CUDAAttribute", +def fir_CUDADataAttribute : I32EnumAttr< + "CUDADataAttribute", "CUDA Fortran variable attributes", [CUDAconstant, CUDAdevice, CUDAmanaged, CUDApinned, CUDAshared, CUDAunified]> { @@ -75,7 +75,8 @@ def fir_CUDAAttribute : I32EnumAttr< let cppNamespace = "::fir"; } -def fir_CUDAAttributeAttr : EnumAttr { +def fir_CUDADataAttributeAttr : + EnumAttr { let assemblyFormat = [{ ```<` $value `>` }]; } diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index d505fed..9f198a4 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2738,7 +2738,7 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> { OptionalAttr:$constant, OptionalAttr:$target, OptionalAttr:$linkName, - OptionalAttr:$cuda_attr + OptionalAttr:$cuda_attr ); let regions = (region AtMostRegion<1>:$region); @@ -3029,7 +3029,7 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, OptionalAttr:$fortran_attrs, - OptionalAttr:$cuda_attr + OptionalAttr:$cuda_attr ); let results = (outs AnyRefOrBox); diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index f22e9a7..c82eae1 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -89,7 +89,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, Variadic:$typeparams, Builtin_StringAttr:$uniq_name, OptionalAttr:$fortran_attrs, - OptionalAttr:$cuda_attr + OptionalAttr:$cuda_attr ); let results = (outs AnyFortranVariable, AnyRefOrBoxLike); @@ -103,7 +103,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments, OpBuilder<(ins "mlir::Value":$memref, "llvm::StringRef":$uniq_name, CArg<"mlir::Value", "{}">:$shape, CArg<"mlir::ValueRange", "{}">:$typeparams, CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs, - CArg<"fir::CUDAAttributeAttr", "{}">:$cuda_attr)>]; + CArg<"fir::CUDADataAttributeAttr", "{}">:$cuda_attr)>]; let extraClassDeclaration = [{ /// Get the variable original base (same as input). 
It lacks diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h index 586701b..84c550a 100644 --- a/flang/include/flang/Optimizer/Support/Utils.h +++ b/flang/include/flang/Optimizer/Support/Utils.h @@ -273,32 +273,32 @@ inline void genMinMaxlocReductionLoop( builder.setInsertionPointAfter(ifMaskTrueOp); } -inline fir::CUDAAttributeAttr -getCUDAAttribute(mlir::MLIRContext *mlirContext, - std::optional cudaAttr) { +inline fir::CUDADataAttributeAttr +getCUDADataAttribute(mlir::MLIRContext *mlirContext, + std::optional cudaAttr) { if (cudaAttr) { - fir::CUDAAttribute attr; + fir::CUDADataAttribute attr; switch (*cudaAttr) { case Fortran::common::CUDADataAttr::Constant: - attr = fir::CUDAAttribute::Constant; + attr = fir::CUDADataAttribute::Constant; break; case Fortran::common::CUDADataAttr::Device: - attr = fir::CUDAAttribute::Device; + attr = fir::CUDADataAttribute::Device; break; case Fortran::common::CUDADataAttr::Managed: - attr = fir::CUDAAttribute::Managed; + attr = fir::CUDADataAttribute::Managed; break; case Fortran::common::CUDADataAttr::Pinned: - attr = fir::CUDAAttribute::Pinned; + attr = fir::CUDADataAttribute::Pinned; break; case Fortran::common::CUDADataAttr::Shared: - attr = fir::CUDAAttribute::Shared; + attr = fir::CUDADataAttribute::Shared; break; case Fortran::common::CUDADataAttr::Texture: // Obsolete attribute return {}; } - return fir::CUDAAttributeAttr::get(mlirContext, attr); + return fir::CUDADataAttributeAttr::get(mlirContext, attr); } return {}; } diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index f67ee88..9c32b71 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -972,7 +972,7 @@ private: if (obj.cudaDataAttr) attrs.emplace_back( mlir::StringAttr::get(&mlirContext, fir::getCUDAAttrName()), - fir::getCUDAAttribute(&mlirContext, obj.cudaDataAttr)); + fir::getCUDADataAttribute(&mlirContext, obj.cudaDataAttr)); // TODO: intents that require special care (e.g finalization) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 2f23757..b2279a3 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -139,7 +139,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, mlir::StringAttr linkage, - fir::CUDAAttributeAttr cudaAttr = {}); + fir::CUDADataAttributeAttr cudaAttr = {}); static mlir::Location genLocation(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { @@ -464,7 +464,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, llvm::StringRef globalName, mlir::StringAttr linkage, - fir::CUDAAttributeAttr cudaAttr) { + fir::CUDADataAttributeAttr cudaAttr) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); const Fortran::semantics::Symbol &sym = var.getSymbol(); mlir::Location loc = genLocation(converter, sym); @@ -1583,11 +1583,11 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes( return fir::FortranVariableFlagsAttr::get(mlirContext, flags); } -fir::CUDAAttributeAttr Fortran::lower::translateSymbolCUDAAttribute( +fir::CUDADataAttributeAttr Fortran::lower::translateSymbolCUDADataAttribute( mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) { std::optional cudaAttr = Fortran::semantics::GetCUDADataAttr(&sym); - 
return fir::getCUDAAttribute(mlirContext, cudaAttr); + return fir::getCUDADataAttribute(mlirContext, cudaAttr); } /// Map a symbol to its FIR address and evaluated specification expressions. @@ -1629,8 +1629,9 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, auto name = converter.mangleName(sym); fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes(builder.getContext(), sym); - fir::CUDAAttributeAttr cudaAttr = - Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), sym); + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(), + sym); if (isCrayPointee) { mlir::Type baseType = @@ -1722,9 +1723,9 @@ void Fortran::lower::genDeclareSymbol( fir::FortranVariableFlagsAttr attributes = Fortran::lower::translateSymbolAttributes( builder.getContext(), sym.GetUltimate(), extraFlags); - fir::CUDAAttributeAttr cudaAttr = - Fortran::lower::translateSymbolCUDAAttribute(builder.getContext(), - sym.GetUltimate()); + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(), + sym.GetUltimate()); auto name = converter.mangleName(sym); hlfir::EntityWithAttributes declare = hlfir::genDeclare(loc, builder, exv, name, attributes, cudaAttr); @@ -2222,8 +2223,8 @@ void Fortran::lower::defineModuleVariable( // Do nothing. Mapping will be done on user side. } else { std::string globalName = converter.mangleName(sym); - fir::CUDAAttributeAttr cudaAttr = - Fortran::lower::translateSymbolCUDAAttribute( + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute( converter.getFirOpBuilder().getContext(), sym); defineGlobal(converter, var, globalName, linkage, cudaAttr); } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 68fe8de..3cce39f 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -274,7 +274,7 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary( fir::GlobalOp fir::FirOpBuilder::createGlobal( mlir::Location loc, mlir::Type type, llvm::StringRef name, mlir::StringAttr linkage, mlir::Attribute value, bool isConst, - bool isTarget, fir::CUDAAttributeAttr cudaAttr) { + bool isTarget, fir::CUDADataAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) @@ -296,7 +296,7 @@ fir::GlobalOp fir::FirOpBuilder::createGlobal( fir::GlobalOp fir::FirOpBuilder::createGlobal( mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst, bool isTarget, std::function bodyBuilder, - mlir::StringAttr linkage, fir::CUDAAttributeAttr cudaAttr) { + mlir::StringAttr linkage, fir::CUDADataAttributeAttr cudaAttr) { auto module = getModule(); auto insertPt = saveInsertionPoint(); if (auto glob = module.lookupSymbol(name)) diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 61e5311..4ffa303f 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -199,7 +199,7 @@ fir::FortranVariableOpInterface hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, fir::FortranVariableFlagsAttr flags, - fir::CUDAAttributeAttr cudaAttr) { + fir::CUDADataAttributeAttr cudaAttr) { mlir::Value base = fir::getBase(exv); assert(fir::conformsWithPassByRef(base.getType()) && diff --git 
a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 04431b6..218fa50 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -298,5 +298,5 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, void FIROpsDialect::registerAttributes() { addAttributes(); + UpperBoundAttr, CUDADataAttributeAttr>(); } diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 85644c1..8bc92a9 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -124,7 +124,7 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder, llvm::StringRef uniq_name, mlir::Value shape, mlir::ValueRange typeparams, fir::FortranVariableFlagsAttr fortran_attrs, - fir::CUDAAttributeAttr cuda_attr) { + fir::CUDADataAttributeAttr cuda_attr) { auto nameAttr = builder.getStringAttr(uniq_name); mlir::Type inputType = memref.getType(); bool hasExplicitLbs = hasExplicitLowerBounds(shape); diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index b15fb59..cd534ba 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -320,12 +320,12 @@ public: mlir::Location loc = declareOp->getLoc(); mlir::Value memref = declareOp.getMemref(); fir::FortranVariableFlagsAttr fortranAttrs; - fir::CUDAAttributeAttr cudaAttr; + fir::CUDADataAttributeAttr cudaAttr; if (auto attrs = declareOp.getFortranAttrs()) fortranAttrs = fir::FortranVariableFlagsAttr::get(rewriter.getContext(), *attrs); if (auto attr = declareOp.getCudaAttr()) - cudaAttr = fir::CUDAAttributeAttr::get(rewriter.getContext(), *attr); + cudaAttr = fir::CUDADataAttributeAttr::get(rewriter.getContext(), *attr); auto firDeclareOp = rewriter.create( loc, memref.getType(), memref, declareOp.getShape(), declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs, diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 4b101ce..790f735 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -50,7 +50,7 @@ TEST_F(FortranVariableTest, SimpleScalar) { auto declare = builder->create(loc, addr.getType(), addr, /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -76,7 +76,7 @@ TEST_F(FortranVariableTest, CharacterScalar) { auto declare = builder->create(loc, addr.getType(), addr, /*shape=*/mlir::Value{}, typeParams, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -107,7 +107,7 @@ TEST_F(FortranVariableTest, SimpleArray) { auto declare = builder->create(loc, addr.getType(), addr, shape, /*typeParams*/ std::nullopt, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); @@ -138,7 +138,7 @@ 
TEST_F(FortranVariableTest, CharacterArray) { auto declare = builder->create(loc, addr.getType(), addr, shape, typeParams, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*cuda_attr=*/fir::CUDAAttributeAttr{}); + /*cuda_attr=*/fir::CUDADataAttributeAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); -- cgit v1.1 From 0b77b19292457b9f2020e290980f1803a16eea34 Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:10:04 -0500 Subject: [AMDGPU] Add test to show s_cselect generation from uniform select (#79384) --- llvm/test/CodeGen/AMDGPU/uniform-select.ll | 219 +++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-select.ll diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll new file mode 100644 index 0000000..0cb4086 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -0,0 +1,219 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s + +define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { +; GFX90A-LABEL: test_insert_extract: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_mov_b32 s2, 0 +; GFX90A-NEXT: s_and_b64 vcc, exec, -1 +; GFX90A-NEXT: s_mov_b32 s3, 0 +; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_mov_b32 s5, 0 +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: .LBB0_1: ; %for.body +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s7, s4, s3 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s7, s6, s7 +; GFX90A-NEXT: s_or_b32 s7, s7, s0 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 +; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 +; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec +; GFX90A-NEXT: s_cselect_b32 s6, s7, s6 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 +; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec +; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 +; GFX90A-NEXT: s_cmp_eq_u32 s1, 0 +; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 +; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX90A-NEXT: s_cselect_b32 s2, 0, s2 +; GFX90A-NEXT: s_mov_b64 vcc, vcc +; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: test_insert_extract: +; GFX940: ; %bb.0: ; %entry +; 
GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_mov_b32 s2, 0 +; GFX940-NEXT: s_and_b64 vcc, exec, -1 +; GFX940-NEXT: s_mov_b32 s3, 0 +; GFX940-NEXT: s_mov_b32 s4, 0 +; GFX940-NEXT: s_mov_b32 s5, 0 +; GFX940-NEXT: s_mov_b32 s6, 0 +; GFX940-NEXT: .LBB0_1: ; %for.body +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s7, s4, s3 +; GFX940-NEXT: s_cmp_eq_u32 s1, 2 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s7, s5, s7 +; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s7, s6, s7 +; GFX940-NEXT: s_or_b32 s7, s7, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX940-NEXT: s_and_b64 s[10:11], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s4, s7, s4 +; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX940-NEXT: s_and_b64 s[12:13], s[10:11], exec +; GFX940-NEXT: s_cselect_b32 s6, s7, s6 +; GFX940-NEXT: s_cmp_eq_u32 s1, 2 +; GFX940-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec +; GFX940-NEXT: s_cselect_b32 s5, s7, s5 +; GFX940-NEXT: s_cmp_eq_u32 s1, 0 +; GFX940-NEXT: s_cselect_b32 s3, s7, s3 +; GFX940-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GFX940-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX940-NEXT: s_cselect_b32 s2, 0, s2 +; GFX940-NEXT: s_mov_b64 vcc, vcc +; GFX940-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX940-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX940-NEXT: s_endpgm +; +; GFX1030-LABEL: test_insert_extract: +; GFX1030: ; %bb.0: ; %entry +; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1030-NEXT: s_mov_b32 s2, 0 +; GFX1030-NEXT: s_mov_b32 s3, 0 +; GFX1030-NEXT: s_mov_b32 s4, 0 +; GFX1030-NEXT: s_mov_b32 s5, 0 +; GFX1030-NEXT: s_mov_b32 s6, 0 +; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1030-NEXT: .p2align 6 +; GFX1030-NEXT: .LBB0_1: ; %for.body +; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1030-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1030-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1030-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1030-NEXT: s_or_b32 s7, s7, s0 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo +; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1030-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo +; GFX1030-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1030-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo +; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1030-NEXT: s_or_b32 s7, s10, s8 +; GFX1030-NEXT: s_or_b32 s7, s9, s7 +; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1030-NEXT: 
s_cselect_b32 s2, 0, s2 +; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX1030-NEXT: s_endpgm +; +; GFX1100-LABEL: test_insert_extract: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1100-NEXT: s_mov_b32 s2, 0 +; GFX1100-NEXT: s_mov_b32 s3, 0 +; GFX1100-NEXT: s_mov_b32 s4, 0 +; GFX1100-NEXT: s_mov_b32 s5, 0 +; GFX1100-NEXT: s_mov_b32 s6, 0 +; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1100-NEXT: .p2align 6 +; GFX1100-NEXT: .LBB0_1: ; %for.body +; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1100-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1100-NEXT: s_or_b32 s7, s7, s0 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo +; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 +; GFX1100-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo +; GFX1100-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 +; GFX1100-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo +; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1100-NEXT: s_or_b32 s7, s10, s8 +; GFX1100-NEXT: s_or_b32 s7, s9, s7 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s2, 0, s2 +; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 +; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX1100-NEXT: s_endpgm +entry: + %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0 + br label %for.body + +for.body: ; preds = %for.body, %entry + %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ] + %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ] + %idxprom = zext i32 %q to i64 + %e1 = extractelement <4 x i32> %x2, i64 %idxprom + %add = or i32 %e1, %p + %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom + %e3 = extractelement <4 x i32> %x1, i64 %idxprom + %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0 + br label %for.body +} + -- cgit v1.1 From 01706e767777aeac9d5a22617d522826b64fce3e Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Fri, 9 Feb 2024 14:22:47 -0800 Subject: [llvm-nm][WebAssembly] Print function symbol sizes (#81315) nm already prints sizes for data symbols. Do that for function symbols too, and update objdump to also print size information. 
Implements item 3 from https://github.com/llvm/llvm-project/issues/76107 --- llvm/include/llvm/Object/Wasm.h | 1 + llvm/lib/Object/WasmObjectFile.cpp | 14 ++++++++++++++ llvm/test/MC/WebAssembly/alias-offset.s | 8 ++++---- llvm/test/MC/WebAssembly/alias.s | 6 +++--- llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml | 2 +- llvm/test/Object/wasm-linked-symbol-table.yaml | 6 +++--- llvm/test/tools/llvm-nm/wasm/linked.yaml | 5 +++++ llvm/test/tools/llvm-nm/wasm/print-size.test | 2 +- llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml | 4 ++-- .../llvm-objdump/wasm/linked-symbol-table-namesec.yaml | 12 ++++++------ llvm/test/tools/llvm-objdump/wasm/symbol-table.test | 12 ++++++------ llvm/tools/llvm-nm/llvm-nm.cpp | 7 ++----- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 +++ 13 files changed, 51 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/Object/Wasm.h b/llvm/include/llvm/Object/Wasm.h index 13d9a17e..b8f7bb45 100644 --- a/llvm/include/llvm/Object/Wasm.h +++ b/llvm/include/llvm/Object/Wasm.h @@ -179,6 +179,7 @@ public: Expected getSymbolType(DataRefImpl Symb) const override; Expected getSymbolSection(DataRefImpl Symb) const override; uint32_t getSymbolSectionId(SymbolRef Sym) const; + uint32_t getSymbolSize(SymbolRef Sym) const; // Overrides from SectionRef. void moveSectionNext(DataRefImpl &Sec) const override; diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 1d68687..04e2b80 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1932,6 +1932,20 @@ uint32_t WasmObjectFile::getSymbolSectionIdImpl(const WasmSymbol &Sym) const { } } +uint32_t WasmObjectFile::getSymbolSize(SymbolRef Symb) const { + const WasmSymbol &Sym = getWasmSymbol(Symb); + if (!Sym.isDefined()) + return 0; + if (Sym.isTypeData()) + return Sym.Info.DataRef.Size; + if (Sym.isTypeFunction()) + return functions()[Sym.Info.ElementIndex - getNumImportedFunctions()].Size; + // Currently symbol size is only tracked for data segments and functions. In + // principle we could also track size (e.g. binary size) for tables, globals + // and element segments etc too. 
+ return 0; +} + void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; } Expected WasmObjectFile::getSectionName(DataRefImpl Sec) const { diff --git a/llvm/test/MC/WebAssembly/alias-offset.s b/llvm/test/MC/WebAssembly/alias-offset.s index e45b17d..4899922 100644 --- a/llvm/test/MC/WebAssembly/alias-offset.s +++ b/llvm/test/MC/WebAssembly/alias-offset.s @@ -12,10 +12,10 @@ sym_a: .set sym_b, sym_a + 4 # CHECK-LABEL: SYMBOL TABLE: -# CHECK-NEXT: 00000000 l O DATA foo -# CHECK-NEXT: 00000004 l O DATA sym_a -# CHECK-NEXT: 00000008 l O DATA sym_b -# CHECK-NEXT: 00000001 l F CODE main +# CHECK-NEXT: 00000000 l O DATA 00000004 foo +# CHECK-NEXT: 00000004 l O DATA 00000008 sym_a +# CHECK-NEXT: 00000008 l O DATA 00000004 sym_b +# CHECK-NEXT: 00000001 l F CODE 00000012 main .text .section .text,"",@ diff --git a/llvm/test/MC/WebAssembly/alias.s b/llvm/test/MC/WebAssembly/alias.s index b0a7539..8ed46f5 100644 --- a/llvm/test/MC/WebAssembly/alias.s +++ b/llvm/test/MC/WebAssembly/alias.s @@ -10,6 +10,6 @@ sym_a: .set sym_b, sym_a -# CHECK: 00000000 l O DATA foo -# CHECK: 00000004 l O DATA sym_a -# CHECK: 00000004 l O DATA sym_b +# CHECK: 00000000 l O DATA 00000004 foo +# CHECK: 00000004 l O DATA 00000004 sym_a +# CHECK: 00000004 l O DATA 00000004 sym_b diff --git a/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml index c730417..5dfa394 100644 --- a/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml +++ b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml @@ -2,7 +2,7 @@ # RUN: llvm-nm -P %t.wasm | FileCheck %s # # Test that names from the linking section override those from the name section -# CHECK: foo T 1 0 +# CHECK: foo T 1 3 # CHECK-NOT: my_func_local_name --- !WASM diff --git a/llvm/test/Object/wasm-linked-symbol-table.yaml b/llvm/test/Object/wasm-linked-symbol-table.yaml index 6dd949a..eccdc2c 100644 --- a/llvm/test/Object/wasm-linked-symbol-table.yaml +++ b/llvm/test/Object/wasm-linked-symbol-table.yaml @@ -2,9 +2,9 @@ # RUN: llvm-objdump -t %t.wasm | FileCheck %s # # CHECK: SYMBOL TABLE: -# CHECK-NEXT: 0000009f g F CODE my_func_export -# CHECK-NEXT: 0000002a g O DATA my_global_export -# CHECK-NEXT: 00000000 g TABLE my_table_export +# CHECK-NEXT: 0000009f g F CODE 00000003 my_func_export +# CHECK-NEXT: 0000002a g O DATA 00000000 my_global_export +# CHECK-NEXT: 00000000 g TABLE 00000000 my_table_export --- !WASM FileHeader: diff --git a/llvm/test/tools/llvm-nm/wasm/linked.yaml b/llvm/test/tools/llvm-nm/wasm/linked.yaml index 992c181..6aee4b9 100644 --- a/llvm/test/tools/llvm-nm/wasm/linked.yaml +++ b/llvm/test/tools/llvm-nm/wasm/linked.yaml @@ -1,10 +1,15 @@ # RUN: yaml2obj %s -o %t.wasm # RUN: llvm-nm %t.wasm | FileCheck %s +# RUN: llvm-nm -P %t.wasm | FileCheck %s --check-prefix=POSIX # CHECK: 0000009f T my_func_export # CHECK-NEXT: 0000002a D my_global_export # CHECK-NEXT: 00000000 D my_table_export +# POSIX: my_func_export T 9f 3 +# POSIX-NEXT: my_global_export D 2a 0 +# POSIX-NEXT: my_table_export D 0 0 + --- !WASM FileHeader: Version: 0x1 diff --git a/llvm/test/tools/llvm-nm/wasm/print-size.test b/llvm/test/tools/llvm-nm/wasm/print-size.test index c166edb..610929b 100644 --- a/llvm/test/tools/llvm-nm/wasm/print-size.test +++ b/llvm/test/tools/llvm-nm/wasm/print-size.test @@ -43,4 +43,4 @@ Sections: Size: 32 # CHECK: 00000000 00000020 D a_data_symbol -# CHECK: 00000001 00000000 T a_func +# CHECK: 00000001 0000000d T a_func diff --git 
a/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml index 9c1e90a..f4abf12 100644 --- a/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml +++ b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml @@ -2,8 +2,8 @@ # RUN: llvm-objdump -t %t.so | FileCheck %s # # CHECK: SYMBOL TABLE: -# CHECK-NEXT: 00000001 g F CODE my_func_export -# CHECK-NEXT: 0000002a g O DATA my_global_export +# CHECK-NEXT: 00000001 g F CODE 00000003 my_func_export +# CHECK-NEXT: 0000002a g O DATA 00000000 my_global_export --- !WASM FileHeader: diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml index 622a606..dc87e62 100644 --- a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml +++ b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml @@ -2,12 +2,12 @@ # RUN: llvm-objdump -t %t.wasm | FileCheck %s # # CHECK: SYMBOL TABLE: -# CHECK-NEXT: 00000000 F *UND* my_func_import_name -# CHECK-NEXT: 00000083 g F CODE my_func_export_name -# CHECK-NEXT: 00000086 l F CODE my_func_local_name -# CHECK-NEXT: 00000000 *UND* my_global_import_name -# CHECK-NEXT: 00000001 g GLOBAL my_global_export_name -# CHECK-NEXT: 00000000 l O DATA my_datasegment_name +# CHECK-NEXT: 00000000 F *UND* 00000000 my_func_import_name +# CHECK-NEXT: 00000083 g F CODE 00000003 my_func_export_name +# CHECK-NEXT: 00000086 l F CODE 00000003 my_func_local_name +# CHECK-NEXT: 00000000 *UND* 00000000 my_global_import_name +# CHECK-NEXT: 00000001 g GLOBAL 00000000 my_global_export_name +# CHECK-NEXT: 00000000 l O DATA 00000004 my_datasegment_name --- !WASM FileHeader: diff --git a/llvm/test/tools/llvm-objdump/wasm/symbol-table.test b/llvm/test/tools/llvm-objdump/wasm/symbol-table.test index b7301a2..ccb0746 100644 --- a/llvm/test/tools/llvm-objdump/wasm/symbol-table.test +++ b/llvm/test/tools/llvm-objdump/wasm/symbol-table.test @@ -1,9 +1,9 @@ RUN: llvm-objdump -t %p/Inputs/trivial.obj.wasm | FileCheck %s CHECK: SYMBOL TABLE: -CHECK-NEXT: 00000001 g F CODE main -CHECK-NEXT: 00000000 l O DATA .L.str -CHECK-NEXT: 00000000 F *UND* puts -CHECK-NEXT: 00000019 l F CODE .LSomeOtherFunction_bitcast -CHECK-NEXT: 00000000 F *UND* SomeOtherFunction -CHECK-NEXT: 00000010 g O DATA var +CHECK-NEXT: 00000001 g F CODE 00000018 main +CHECK-NEXT: 00000000 l O DATA 0000000d .L.str +CHECK-NEXT: 00000000 F *UND* 00000000 puts +CHECK-NEXT: 00000019 l F CODE 0000000b .LSomeOtherFunction_bitcast +CHECK-NEXT: 00000000 F *UND* 00000000 SomeOtherFunction +CHECK-NEXT: 00000010 g O DATA 00000004 var diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index da5998b..e3b8145 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -1854,11 +1854,8 @@ static bool getSymbolNamesFromObject(SymbolicFile &Obj, dyn_cast(&Obj)) S.Size = XCOFFObj->getSymbolSize(Sym.getRawDataRefImpl()); - if (const WasmObjectFile *WasmObj = dyn_cast(&Obj)) { - const WasmSymbol &WasmSym = WasmObj->getWasmSymbol(Sym); - if (WasmSym.isTypeData() && !WasmSym.isUndefined()) - S.Size = WasmSym.Info.DataRef.Size; - } + if (const WasmObjectFile *WasmObj = dyn_cast(&Obj)) + S.Size = WasmObj->getSymbolSize(Sym); if (PrintAddress && isa(Obj)) { SymbolRef SymRef(Sym); diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index de52ebc..0e4f4e1 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ 
b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2947,6 +2947,9 @@ void Dumper::printSymbol(const SymbolRef &Symbol, Symbol.getRawDataRefImpl())); else if (O.isELF()) outs() << '\t' << format(Fmt, ELFSymbolRef(Symbol).getSize()); + else if (O.isWasm()) + outs() << '\t' + << format(Fmt, cast(O).getSymbolSize(Symbol)); if (O.isELF()) { if (!SymbolVersions.empty()) { -- cgit v1.1 From 9397d23671f26ab8631e90f688ae2ea212f3c770 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 14:26:49 -0800 Subject: [docs] --save-temps=: add single quotes after #80921 and update --save-temps --- clang/include/clang/Driver/Options.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4f498db..31503fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5392,13 +5392,13 @@ def regcall4 : Flag<["-"], "regcall4">, Group, MarshallingInfoFlag>; def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, - HelpText<"Save intermediate compilation results. can be set to cwd for " - "current working directory, or obj which will save temporary files in the " + HelpText<"Save intermediate compilation results. can be set to 'cwd' for " + "current working directory, or 'obj' which will save temporary files in the " "same directory as the final output file">; def save_temps : Flag<["-", "--"], "save-temps">, Flags<[NoXarchOption]>, Visibility<[ClangOption, FlangOption, FC1Option]>, Alias, AliasArgs<["cwd"]>, - HelpText<"Save intermediate compilation results">; + HelpText<"Alias for --save-temps=cwd">; def save_stats_EQ : Joined<["-", "--"], "save-stats=">, Flags<[NoXarchOption]>, HelpText<"Save llvm statistics.">; def save_stats : Flag<["-", "--"], "save-stats">, Flags<[NoXarchOption]>, -- cgit v1.1 From 0267f9800ea23921120faa4b1d46ac5806e5eca1 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 9 Feb 2024 15:01:04 -0800 Subject: [workflows] Add a new workflow for testing release branch CI (#81073) Since we commit all changes to the release branch CI to main first, we need someway to test that these changes to main don't break the CI. --- .github/workflows/llvm-project-workflow-tests.yml | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/llvm-project-workflow-tests.yml diff --git a/.github/workflows/llvm-project-workflow-tests.yml b/.github/workflows/llvm-project-workflow-tests.yml new file mode 100644 index 0000000..a2539b2 --- /dev/null +++ b/.github/workflows/llvm-project-workflow-tests.yml @@ -0,0 +1,32 @@ +# This workflow will test the llvm-project-tests workflow in PRs +# targetting the main branch. Since this workflow doesn't normally +# run on main PRs, we need some way to test it to ensure new updates +# don't break it. + +name: LLVM Workflow Test + +permissions: + contents: read + +on: + pull_request: + branches: + - 'main' + paths: + - '.github/workflows/llvm-project-tests.yml' + - '.github/workflows/llvm-project-workflow-tests.yml' + +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + +jobs: + llvm-test: + if: github.repository_owner == 'llvm' + name: Build and Test + uses: ./.github/workflows/llvm-project-tests.yml + with: + build_target: check-all + projects: clang;lld;libclc;lldb -- cgit v1.1 From fbba818a78f591d89f25768ba31783714d526532 Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:22:09 -0800 Subject: [AArch64] Add the Ampere1B core (#81297) The Ampere1B is Ampere's third-generation core implementing a superscalar, out-of-order microarchitecture with nested virtualization, speculative side-channel mitigation and architectural support for defense against ROP/JOP style software attacks. Ampere1B is an ARMv8.7+ implementation, adding support for the FEAT WFxT, FEAT CSSC, FEAT PAN3 and FEAT AFP extensions. It also includes all features of the second-generation Ampere1A, such as the Memory Tagging Extension and SM3/SM4 cryptography instructions. --- clang/test/Driver/aarch64-cssc.c | 1 + clang/test/Misc/target-invalid-cpu-note.c | 4 ++-- .../llvm/TargetParser/AArch64TargetParser.h | 6 +++++ llvm/lib/Target/AArch64/AArch64.td | 26 ++++++++++++++++++++++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 1 + llvm/lib/Target/AArch64/AArch64Subtarget.h | 1 + llvm/lib/TargetParser/Host.cpp | 1 + llvm/test/CodeGen/AArch64/cpus.ll | 1 + llvm/test/CodeGen/AArch64/neon-dot-product.ll | 1 + llvm/test/CodeGen/AArch64/remat.ll | 1 + llvm/test/MC/AArch64/armv8.2a-dotprod.s | 3 +++ .../test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt | 1 + llvm/unittests/TargetParser/Host.cpp | 3 +++ llvm/unittests/TargetParser/TargetParserTest.cpp | 14 +++++++++++- 14 files changed, 61 insertions(+), 3 deletions(-) diff --git a/clang/test/Driver/aarch64-cssc.c b/clang/test/Driver/aarch64-cssc.c index a3e1866..5df0ea7 100644 --- a/clang/test/Driver/aarch64-cssc.c +++ b/clang/test/Driver/aarch64-cssc.c @@ -9,6 +9,7 @@ // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+cssc %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+nocssc %s 2>&1 | FileCheck %s --check-prefix=NO_CSSC +// RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -mcpu=ampere1b %s 2>&1 | FileCheck %s // CHECK: "target-features"="{{.*}},+cssc // NO_CSSC: "target-features"="{{.*}},-cssc diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 2f10bfb..39ed02f 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -5,11 +5,11 @@ // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' -// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, 
exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, cobalt-100, grace{{$}} +// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu' -// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, cobalt-100, grace{{$}} +// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86 // X86: error: unknown target CPU 'not-a-cpu' diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index cce9d6d..ed9944b 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -805,6 +805,12 @@ inline constexpr CpuInfo CpuInfos[] = { {AArch64::AEK_FP16, AArch64::AEK_RAND, AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_MTE, AArch64::AEK_SB, AArch64::AEK_SSBS}))}, + 
{"ampere1b", ARMV8_7A, + (AArch64::ExtensionBitset({AArch64::AEK_FP16, AArch64::AEK_RAND, + AArch64::AEK_SM4, AArch64::AEK_SHA3, + AArch64::AEK_SHA2, AArch64::AEK_AES, + AArch64::AEK_MTE, AArch64::AEK_SB, + AArch64::AEK_SSBS, AArch64::AEK_CSSC}))}, }; // An alias for a CPU. diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 02fb01c..00833b4 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1376,6 +1376,24 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; +def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", + "Ampere Computing Ampere-1B processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureFuseAdrpAdd, + FeatureAddrLSLFast, + FeatureALULSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals, + FeatureStorePairSuppress, + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive, + FeatureLdpAlignedOnly, + FeatureStpAlignedOnly]>; + def ProcessorFeatures { list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon]; @@ -1530,6 +1548,11 @@ def ProcessorFeatures { FeatureMTE, FeatureSSBS, FeatureRandGen, FeatureSB, FeatureSM4, FeatureSHA2, FeatureSHA3, FeatureAES]; + list Ampere1B = [HasV8_7aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS, FeatureRandGen, + FeatureSB, FeatureSM4, FeatureSHA2, + FeatureSHA3, FeatureAES, FeatureCSSC, + FeatureWFxT]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. The extensions do not @@ -1697,6 +1720,9 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; +def : ProcessorModel<"ampere1b", Ampere1Model, ProcessorFeatures.Ampere1B, + [TuneAmpere1B]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 6550c12..2b01deb 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -296,6 +296,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { break; case Ampere1: case Ampere1A: + case Ampere1B: CacheLineSize = 64; PrefFunctionAlignment = Align(64); PrefLoopAlignment = Align(64); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 0292c01..01cc471 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -42,6 +42,7 @@ public: A64FX, Ampere1, Ampere1A, + Ampere1B, AppleA7, AppleA10, AppleA11, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index f1197c2..4466d50 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -321,6 +321,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { return StringSwitch(Part) .Case("0xac3", "ampere1") .Case("0xac4", "ampere1a") + .Case("0xac5", "ampere1b") .Default("generic"); } diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index b248660..7b45d0f 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll 
+++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -37,6 +37,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=a64fx 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1a 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1b 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll index 23d1e43..cf09a46 100644 --- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b < %s | FileCheck %s declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) diff --git a/llvm/test/CodeGen/AArch64/remat.ll b/llvm/test/CodeGen/AArch64/remat.ll index 483c4d7..704c87f 100644 --- a/llvm/test/CodeGen/AArch64/remat.ll +++ b/llvm/test/CodeGen/AArch64/remat.ll @@ -26,6 +26,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx3t110 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1a -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1b -o - %s | FileCheck %s %X = type { i64, i64, i64 } declare void @f(ptr) diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod.s b/llvm/test/MC/AArch64/armv8.2a-dotprod.s index a49ed14..4d964090 100644 --- a/llvm/test/MC/AArch64/armv8.2a-dotprod.s +++ b/llvm/test/MC/AArch64/armv8.2a-dotprod.s @@ -15,6 +15,7 @@ // RUN: llvm-mc -triple aarch64 -mattr=+v8r,+dotprod -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1a -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD +// RUN: llvm-mc -triple aarch64 -mcpu=ampere1b -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s @@ -42,6 +43,8 @@ // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s // RUN: not llvm-mc -triple aarch64 -mcpu=ampere1a -mattr=-dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s +// RUN: not llvm-mc -triple aarch64 -mcpu=ampere1b -mattr=-dotprod -show-encoding < %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s udot v0.2s, v1.8b, v2.8b sdot v0.2s, v1.8b, v2.8b diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt index 907d0c3..259cb9d 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt @@ -14,6 +14,7 @@ # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-n2 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple 
aarch64-none-linux-gnu -mcpu=ampere1 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1a --disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1b --disassemble < %s | FileCheck %s # CHECK: ldaprb w0, [x0] # CHECK: ldaprh w0, [x0] diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp index 5f15161..6aa1d7a 100644 --- a/llvm/unittests/TargetParser/Host.cpp +++ b/llvm/unittests/TargetParser/Host.cpp @@ -122,6 +122,9 @@ TEST(getLinuxHostCPUName, AArch64) { EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" "CPU part : 0xac4"), "ampere1a"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" + "CPU part : 0xac5"), + "ampere1b"); // MSM8992/4 weirdness StringRef MSM8992ProcCpuInfo = R"( diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index e7f9973..e89fc68 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1601,6 +1601,18 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( + "ampere1b", "armv8.7-a", "crypto-neon-fp-armv8", + (AArch64::ExtensionBitset( + {AArch64::AEK_CRC, AArch64::AEK_FP, AArch64::AEK_FP16, + AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_LSE, + AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, + AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_BF16, + AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, + AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, + AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH, AArch64::AEK_CSSC})), + "8.7-A"), + ARMCPUTestParams( "neoverse-512tvb", "armv8.4-a", "crypto-neon-fp-armv8", (AArch64::ExtensionBitset( {AArch64::AEK_RAS, AArch64::AEK_SVE, AArch64::AEK_SSBS, @@ -1679,7 +1691,7 @@ INSTANTIATE_TEST_SUITE_P( ARMCPUTestParams::PrintToStringParamName); // Note: number of CPUs includes aliases. -static constexpr unsigned NumAArch64CPUArchs = 68; +static constexpr unsigned NumAArch64CPUArchs = 69; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List; -- cgit v1.1 From 7b2eff6306c1a20f69f16bc485dbc229c8ada40d Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:31:03 -0800 Subject: [AArch64] Add FeatureFuseAddSub2RegAndConstOne for Ampere1A (#81295) Ampere1A introduced fusion for A+B+1/A-B-1. However, the feature flag to enable that fusion case was never added to TuneAmpere1A. This commit corrects that omission. --- llvm/lib/Target/AArch64/AArch64.td | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 00833b4..8f8cc15 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1372,6 +1372,7 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureFuseAddress, FeatureFuseLiterals, FeatureFuseLiterals, + FeatureFuseAddSub2RegAndConstOne, FeatureStorePairSuppress, FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; -- cgit v1.1 From ff2e8788d277cbb8c47fa2a8ea87dec7e06307aa Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:31:30 -0800 Subject: [AArch64] Add FeatureFuseAdrpAdd for Ampere1/1A (#81293) Both Ampere1 and Ampere1A support fusion of ADRP+ADD. This adds the missing feature to enable fusion-aware scheduling for this case.
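A quick aside on how these tuning bits take effect: a SubtargetFeature such as FeatureFuseAdrpAdd or FeatureFuseAddSub2RegAndConstOne only sets a boolean on the subtarget, and the AArch64 macro-fusion hook consults that boolean before asking the scheduler to keep a candidate instruction pair adjacent. The sketch below shows the shape of that check with simplified, invented types (Opc, Subtarget, and the reduced signature are illustrative); it is not the actual code in AArch64MacroFusion.cpp.

    // Simplified stand-ins for machine opcodes and the subtarget.
    enum class Opc { ADRP, ADDXri, Other };

    struct Subtarget {
      bool HasFuseAdrpAdd = false; // set once the tune list carries FeatureFuseAdrpAdd
    };

    // Shaped like LLVM's shouldScheduleAdjacent() fusion hook: returning true
    // asks the scheduler to keep the pair back-to-back so the core can fuse it.
    static bool shouldScheduleAdjacent(const Subtarget &ST, Opc First, Opc Second) {
      // Without the feature bit there is no fusion hint at all, which is the
      // omission these two patches correct.
      return ST.HasFuseAdrpAdd && First == Opc::ADRP && Second == Opc::ADDXri;
    }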
--- llvm/lib/Target/AArch64/AArch64.td | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 8f8cc15..5098dd8 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1349,6 +1349,7 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", "Ampere Computing Ampere-1 processors", [ FeaturePostRAScheduler, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureAddrLSLFast, FeatureALULSLFast, FeatureAggressiveFMA, @@ -1364,6 +1365,7 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", "Ampere Computing Ampere-1A processors", [ FeaturePostRAScheduler, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureAddrLSLFast, FeatureALULSLFast, FeatureAggressiveFMA, -- cgit v1.1 From 4f0ee665b58f3f70cd7e8edad6704b2b053b7ea9 Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:34:37 -0800 Subject: [AArch64] [NFC] Remove duplicate FeatureFuseLiterals from Ampere1A (#81292) --- llvm/lib/Target/AArch64/AArch64.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 5098dd8..e76204f 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1373,7 +1373,6 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureCmpBccFusion, FeatureFuseAddress, FeatureFuseLiterals, - FeatureFuseLiterals, FeatureFuseAddSub2RegAndConstOne, FeatureStorePairSuppress, FeatureLdpAlignedOnly, -- cgit v1.1 From 014401158bbbc6899144905c1eb9e44fac86867e Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Fri, 9 Feb 2024 15:48:46 -0800 Subject: [AArch64] Add Ampere1B scheduling/pipeline model (#81338) The Ampere1B core is enabled with a new scheduling/pipeline model, as it provides significant updates over the Ampere1 core; it reduces latencies on many instructions, has some micro-ops reassigned between the XY and X units, and provides modelling for the instructions added since Ampere1 and Ampere1A. 
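To make the scheduling model below easier to read: every SchedWriteRes definition ties one write type to a result latency, a micro-op count, and the pipeline units those micro-ops occupy. As an illustrative C++ rendering only (SchedWriteEntry is an invented type; the real tables are generated by TableGen from these definitions), a single entry of the model amounts to:

    #include <string>
    #include <vector>

    // Illustrative view of what one SchedWriteRes record encodes.
    struct SchedWriteEntry {
      std::vector<std::string> Units; // pipeline resources the micro-ops occupy
      unsigned Latency;               // cycles until the result is available
      unsigned NumMicroOps;           // micro-ops issued across those units
    };

    // Ampere1BWrite_4cyc_2XY from the patch: two micro-ops on the FP/vector
    // XY units, with the result ready after four cycles.
    const SchedWriteEntry Ampere1BWrite_4cyc_2XY{
        {"Ampere1BUnitXY", "Ampere1BUnitXY"}, /*Latency=*/4, /*NumMicroOps=*/2};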
--- llvm/lib/Target/AArch64/AArch64.td | 3 +- llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td | 1061 +++++++++++++++++++++++ 2 files changed, 1063 insertions(+), 1 deletion(-) create mode 100644 llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index e76204f..156c48e 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -837,6 +837,7 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" +include "AArch64SchedAmpere1B.td" include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" @@ -1722,7 +1723,7 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; -def : ProcessorModel<"ampere1b", Ampere1Model, ProcessorFeatures.Ampere1B, +def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, [TuneAmpere1B]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td new file mode 100644 index 0000000..43da762 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td @@ -0,0 +1,1061 @@ +//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1B to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). + +def Ampere1BModel : SchedMachineModel { + let IssueWidth = 4; // 4-way decode and dispatch + let MicroOpBufferSize = 192; // micro-op re-order buffer size + let LoadLatency = 3; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 0; + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F, + PAUnsupported.F); +} + +let SchedModel = Ampere1BModel in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1. +// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. 
+ +def Ampere1BUnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1BUnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1BUnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1BUnitL : ProcResource<2>; // load +def Ampere1BUnitS : ProcResource<2>; // store address calculation +def Ampere1BUnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1BUnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1BUnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>; +def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. + +def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS_1B : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1AB_1A : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1AB : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S_1AB : SchedWriteRes<[Ampere1BUnitB, + Ampere1BUnitS, + Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, + 
Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def Ampere1BWrite_3cyc_2S_2Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def Ampere1BWrite_4cyc_1BS_1AB : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1BUnitXY, + Ampere1BUnitS + Ampere1BUnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_5cyc_4S_4Z : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1BWrite_5cyc_1L_1BS : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_3L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def Ampere1BWrite_5cyc_4L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_1BS_1A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_1BS_2A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, + Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1BWrite_7cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + 
+def Ampere1BWrite_7cyc_1XY_1Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1BWrite_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1BWrite_8cyc_1BS_1L : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_4XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1BWrite_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1BWrite_9cyc_1A_1BS_1X : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 1; +} + +def Ampere1BWrite_9cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 12; + let NumMicroOps = 1; +} + +def Ampere1BWrite_13cyc_1BS_1X : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 13; + let NumMicroOps = 2; +} + +def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 17; + let NumMicroOps = 1; +} + +def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS, + Ampere1BUnitBS, + Ampere1BUnitX]> { + let Latency = 13; + let NumMicroOps = 3; +} + +def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 21; + let NumMicroOps = 1; +} + +def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 33; + let NumMicroOps = 1; +} + +def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 63; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), +// which are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. 
+def Ampere1BWrite_Arith : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. + +def : WriteRes; // MOVN, MOVZ +def : WriteRes; // ALU +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes; // EXTR shifts a reg pair +def : WriteRes; // Shift/Scale +def : WriteRes { + let Latency = 13; +} // 32-bit Divide +def : WriteRes { + let Latency = 19; +} // 64-bit Divide +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes { + let Latency = 3; +} // 64-bit Multiply +def : WriteRes; +def : WriteRes; +def : WriteRes { + let Latency = 3; +} // Load from base addr plus immediate offset +def : WriteRes { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes { + let Latency = 1; + let NumMicroOps = 1; +} // Store a register pair. +def : WriteRes; +def : WriteRes { + let Latency = 3; + let NumMicroOps = 1; +} // Load from a register index (maybe scaled). +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes { + let Latency = 2; +} // General floating-point ops. +def : WriteRes { + let Latency = 3; +} // Floating-point compare. +def : WriteRes { + let Latency = 3; +} // Float conversion. +def : WriteRes { +} // Float-int register copy. +def : WriteRes { + let Latency = 2; +} // Float-int register copy. +def : WriteRes { + let Latency = 4; +} // Floating-point multiply. +def : WriteRes { + let Latency = 19; +} // Floating-point division. +def : WriteRes { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes { + let Latency = 4; +} // Vector loads. +def : WriteRes { + let Latency = 2; +} // Vector stores. + +def : WriteRes { let Unsupported = 1; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 3; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1B. 
+ +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>; + +// Common Short Sequence Compression (CSSC) +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs ABS)>; +def : InstRW<[Ampere1BWrite_1cyc_1BS], (instrs CNT)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs SMAX, SMIN)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs CTZ)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs UMAX, USMIN)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>; +// -- SM3 hash +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$"0)>; +def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : 
InstRW<[Ampere1BWrite_8cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_8cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1BWrite_8cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1BWrite_11cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1BWrite_10cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1BWrite_4cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1BWrite_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 
3-element structures from 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1BWrite_2cyc_1S_2Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +// Convert FP to integer, H-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi]16")>; +// Convert to FP from GPR, H-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>; +// Convert to FP from GPR, fixed-point, H-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>; +def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>; +def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +// FP square root, H-form +def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>; +// FP square root, 
vector-form, F16 +def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +// Convert FP to integer, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>; +// Convert to FP from GPR, S/D-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>; +// Convert to FP from GPR, fixed-point, S/D-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>; +def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?MUL")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1BWrite_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>; +def : 
InstRW<[Ampere1BWrite_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADC|SBC)S[WX]r")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "EXTR(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1BWrite_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STPXi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S], (instrs STPWi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroW, STRXroW)>; + +// Memory tagging + +// Insert Random Tags +def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>; +// Load allocation tag +def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>; +// Store allocation tags +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STGi, STGM, STGPreIndex, STGPostIndex)>; +// Store allocation tags and pair of registers +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STGPi, STGPpre, STGPpost)>; +// Store allocation tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>; +// Store two tags +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>; +// 
Store two tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>; +// Subtract Pointer +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>; +// Subtract Pointer, flagset +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>; +// Insert Tag Mask +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>; +// Arithmetic, immediate to logical address tag +def : InstRW<[Ampere1BWrite_1cyc_B], (instrs ADDG, SUBG)>; + +// Pointer authentication +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>; +def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^SMAXv", 
"^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1BModel -- cgit v1.1 From 78145a6bd0023ff1c218dda59b192345d773ebe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 9 Feb 2024 15:57:16 -0800 Subject: [flang][cuda] Lower attribute for procedure (#81336) This PR adds a new attribute to represent the CUDA attribute attached to procedure. This attribute is attached to the func.func operation during lowering. Other procedures information such as `launch_bounds` and `cluster_dims` will be added separately. 
--- flang/include/flang/Optimizer/Dialect/FIRAttr.td | 54 ++++++++++++++++-------- flang/include/flang/Optimizer/Support/Utils.h | 27 ++++++++++++ flang/lib/Lower/CallInterface.cpp | 7 +++ flang/lib/Optimizer/Dialect/FIRAttr.cpp | 2 +- flang/test/Lower/CUDA/cuda-proc-attribute.cuf | 34 +++++++++++++++ 5 files changed, 106 insertions(+), 18 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-proc-attribute.cuf diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 422ad53..00e293e 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -58,19 +58,34 @@ def fir_FortranVariableFlagsAttr : fir_Attr<"FortranVariableFlags"> { "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)"; } -def CUDAconstant : I32EnumAttrCase<"Constant", 0, "constant">; -def CUDAdevice : I32EnumAttrCase<"Device", 1, "device">; -def CUDAmanaged : I32EnumAttrCase<"Managed", 2, "managed">; -def CUDApinned : I32EnumAttrCase<"Pinned", 3, "pinned">; -def CUDAshared : I32EnumAttrCase<"Shared", 4, "shared">; -def CUDAunified : I32EnumAttrCase<"Unified", 5, "unified">; -// Texture is omitted since it is obsolete and rejected by semantic. +def fir_BoxFieldAttr : I32EnumAttr< + "BoxFieldAttr", "", + [ + I32EnumAttrCase<"base_addr", 0>, + I32EnumAttrCase<"derived_type", 1> + ]> { + let cppNamespace = "fir"; +} + +// mlir::SideEffects::Resource for modelling operations which add debugging information +def DebuggingResource : Resource<"::fir::DebuggingResource">; + +//===----------------------------------------------------------------------===// +// CUDA Fortran specific attributes +//===----------------------------------------------------------------------===// def fir_CUDADataAttribute : I32EnumAttr< "CUDADataAttribute", "CUDA Fortran variable attributes", - [CUDAconstant, CUDAdevice, CUDAmanaged, CUDApinned, CUDAshared, - CUDAunified]> { + [ + I32EnumAttrCase<"Constant", 0, "constant">, + I32EnumAttrCase<"Device", 1, "device">, + I32EnumAttrCase<"Managed", 2, "managed">, + I32EnumAttrCase<"Pinned", 3, "pinned">, + I32EnumAttrCase<"Shared", 4, "shared">, + I32EnumAttrCase<"Unified", 5, "unified">, + // Texture is omitted since it is obsolete and rejected by semantic. 
+ ]> { let genSpecializedAttr = 0; let cppNamespace = "::fir"; } @@ -80,17 +95,22 @@ def fir_CUDADataAttributeAttr : let assemblyFormat = [{ ```<` $value `>` }]; } -def fir_BoxFieldAttr : I32EnumAttr< - "BoxFieldAttr", "", +def fir_CUDAProcAttribute : I32EnumAttr< + "CUDAProcAttribute", "CUDA Fortran procedure attributes", [ - I32EnumAttrCase<"base_addr", 0>, - I32EnumAttrCase<"derived_type", 1> + I32EnumAttrCase<"Host", 0, "host">, + I32EnumAttrCase<"Device", 1, "device">, + I32EnumAttrCase<"HostDevice", 2, "host_device">, + I32EnumAttrCase<"Global", 3, "global">, + I32EnumAttrCase<"GridGlobal", 4, "grid_global">, ]> { - let cppNamespace = "fir"; + let genSpecializedAttr = 0; + let cppNamespace = "::fir"; } - -// mlir::SideEffects::Resource for modelling operations which add debugging information -def DebuggingResource : Resource<"::fir::DebuggingResource">; +def fir_CUDAProcAttributeAttr : + EnumAttr { + let assemblyFormat = [{ ```<` $value `>` }]; +} #endif // FIR_DIALECT_FIR_ATTRS diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h index 84c550a..4e06bf8 100644 --- a/flang/include/flang/Optimizer/Support/Utils.h +++ b/flang/include/flang/Optimizer/Support/Utils.h @@ -303,6 +303,33 @@ getCUDADataAttribute(mlir::MLIRContext *mlirContext, return {}; } +inline fir::CUDAProcAttributeAttr getCUDAProcAttribute( + mlir::MLIRContext *mlirContext, + std::optional cudaAttr) { + if (cudaAttr) { + fir::CUDAProcAttribute attr; + switch (*cudaAttr) { + case Fortran::common::CUDASubprogramAttrs::Host: + attr = fir::CUDAProcAttribute::Host; + break; + case Fortran::common::CUDASubprogramAttrs::Device: + attr = fir::CUDAProcAttribute::Device; + break; + case Fortran::common::CUDASubprogramAttrs::HostDevice: + attr = fir::CUDAProcAttribute::HostDevice; + break; + case Fortran::common::CUDASubprogramAttrs::Global: + attr = fir::CUDAProcAttribute::Global; + break; + case Fortran::common::CUDASubprogramAttrs::Grid_Global: + attr = fir::CUDAProcAttribute::GridGlobal; + break; + } + return fir::CUDAProcAttributeAttr::get(mlirContext, attr); + } + return {}; +} + } // namespace fir #endif // FORTRAN_OPTIMIZER_SUPPORT_UTILS_H diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index 9c32b71..41597c1 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/CallInterface.h" +#include "flang/Common/Fortran.h" #include "flang/Evaluate/fold.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/Mangler.h" @@ -559,6 +560,12 @@ void Fortran::lower::CallInterface::declare() { func.setArgAttrs(placeHolder.index(), placeHolder.value().attributes); side().setFuncAttrs(func); } + if (characteristic && characteristic->cudaSubprogramAttrs) { + func.getOperation()->setAttr( + fir::getCUDAAttrName(), + fir::getCUDAProcAttribute(func.getContext(), + *characteristic->cudaSubprogramAttrs)); + } } } diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 218fa50..8df7a6c 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -298,5 +298,5 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, void FIROpsDialect::registerAttributes() { addAttributes(); + UpperBoundAttr, CUDADataAttributeAttr, CUDAProcAttributeAttr>(); } diff --git a/flang/test/Lower/CUDA/cuda-proc-attribute.cuf 
b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf new file mode 100644 index 0000000..0507310 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf @@ -0,0 +1,34 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s + +! Test lowering of CUDA attribute on procedures. + +attributes(host) subroutine sub_host(); end +! CHECK: func.func @_QPsub_host() attributes {fir.cuda_attr = #fir.cuda_proc<host>} + +attributes(device) subroutine sub_device(); end +! CHECK: func.func @_QPsub_device() attributes {fir.cuda_attr = #fir.cuda_proc<device>} + +attributes(host) attributes(device) subroutine sub_host_device; end +! CHECK: func.func @_QPsub_host_device() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} + +attributes(device) attributes(host) subroutine sub_device_host; end +! CHECK: func.func @_QPsub_device_host() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} + +attributes(global) subroutine sub_global(); end +! CHECK: func.func @_QPsub_global() attributes {fir.cuda_attr = #fir.cuda_proc<global>} + +attributes(grid_global) subroutine sub_grid_global(); end +! CHECK: func.func @_QPsub_grid_global() attributes {fir.cuda_attr = #fir.cuda_proc<grid_global>} + +attributes(host) integer function fct_host(); end +! CHECK: func.func @_QPfct_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host>} + +attributes(device) integer function fct_device(); end +! CHECK: func.func @_QPfct_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<device>} + +attributes(host) attributes(device) integer function fct_host_device; end +! CHECK: func.func @_QPfct_host_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} + +attributes(device) attributes(host) integer function fct_device_host; end +! CHECK: func.func @_QPfct_device_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>} -- cgit v1.1 From 8509f75d618a41b946391a73cdbfee53565fbf85 Mon Sep 17 00:00:00 2001 From: Philipp Tomsich Date: Sat, 10 Feb 2024 00:57:08 +0100 Subject: Revert "[AArch64] Add Ampere1B scheduling/pipeline model (#81338)" This reverts commit 014401158bbbc6899144905c1eb9e44fac86867e.
--- llvm/lib/Target/AArch64/AArch64.td | 3 +- llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td | 1061 ----------------------- 2 files changed, 1 insertion(+), 1063 deletions(-) delete mode 100644 llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 156c48e..e76204f 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -837,7 +837,6 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" -include "AArch64SchedAmpere1B.td" include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" @@ -1723,7 +1722,7 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; -def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, +def : ProcessorModel<"ampere1b", Ampere1Model, ProcessorFeatures.Ampere1B, [TuneAmpere1B]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td deleted file mode 100644 index 43da762..0000000 --- a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td +++ /dev/null @@ -1,1061 +0,0 @@ -//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for the Ampere Computing Ampere-1B to -// support instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -// The Ampere-1 core is an out-of-order micro-architecture. The front -// end has branch prediction, with a 10-cycle recovery time from a -// mispredicted branch. Instructions coming out of the front end are -// decoded into internal micro-ops (uops). - -def Ampere1BModel : SchedMachineModel { - let IssueWidth = 4; // 4-way decode and dispatch - let MicroOpBufferSize = 192; // micro-op re-order buffer size - let LoadLatency = 3; // Optimistic load latency - let MispredictPenalty = 10; // Branch mispredict penalty - let LoopMicroOpBufferSize = 32; // Instruction queue size - let CompleteModel = 0; - - list UnsupportedFeatures = !listconcat(SVEUnsupported.F, - SMEUnsupported.F, - PAUnsupported.F); -} - -let SchedModel = Ampere1BModel in { - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available on Ampere-1. -// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, -// and 2 memory) issue into. The integer and FP schedulers can each issue -// one uop per cycle, while the memory schedulers can each issue one load -// and one store address calculation per cycle. 
- -def Ampere1BUnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w -def Ampere1BUnitB : ProcResource<2>; // integer single-cycle, and complex shifts -def Ampere1BUnitBS : ProcResource<1>; // integer multi-cycle -def Ampere1BUnitL : ProcResource<2>; // load -def Ampere1BUnitS : ProcResource<2>; // store address calculation -def Ampere1BUnitX : ProcResource<1>; // FP and vector operations, and flag write -def Ampere1BUnitY : ProcResource<1>; // FP and vector operations, and crypto -def Ampere1BUnitZ : ProcResource<1>; // FP store data and FP-to-integer moves - -def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>; -def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>; - -//===----------------------------------------------------------------------===// -// Define customized scheduler read/write types specific to the Ampere-1. - -def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1BS_1B : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1AB_1A : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]> { - let Latency = 1; - let NumMicroOps = 1; -} - -def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]> { - let Latency = 2; - let NumMicroOps = 1; -} - -def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1B_1AB : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_2cyc_1B_1S_1AB : SchedWriteRes<[Ampere1BUnitB, - Ampere1BUnitS, - Ampere1BUnitAB]> { - let Latency = 2; - let NumMicroOps = 3; -} - -def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { - let Latency = 2; - let NumMicroOps = 1; -} - -def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 1; -} - -def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, - 
Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def Ampere1BWrite_3cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def Ampere1BWrite_3cyc_2S_2Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def Ampere1BWrite_4cyc_1BS_1AB : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { - let Latency = 4; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 4; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { - let Latency = 4; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 1; -} - -def Ampere1BWrite_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1BUnitXY, - Ampere1BUnitS - Ampere1BUnitZ]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 1; -} - -def Ampere1BWrite_5cyc_4S_4Z : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 8; -} - -def Ampere1BWrite_5cyc_1L_1BS : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitBS]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def Ampere1BWrite_5cyc_3L : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitL, - Ampere1BUnitL]> { - let Latency = 5; - let NumMicroOps = 3; -} - -def Ampere1BWrite_5cyc_4L : SchedWriteRes<[Ampere1BUnitL, - Ampere1BUnitL, - Ampere1BUnitL, - Ampere1BUnitL]> { - let Latency = 5; - let NumMicroOps = 4; -} - -def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 5; - let NumMicroOps = 1; -} - -def Ampere1BWrite_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 5; - let NumMicroOps = 6; -} - -def Ampere1BWrite_6cyc_1BS_1A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def Ampere1BWrite_6cyc_1BS_2A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, - Ampere1BUnitA]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def Ampere1BWrite_6cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def Ampere1BWrite_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 6; - let NumMicroOps = 6; -} - -def Ampere1BWrite_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 6; - let NumMicroOps = 9; -} - -def Ampere1BWrite_7cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { - let Latency = 7; - let NumMicroOps = 2; -} - 
-def Ampere1BWrite_7cyc_1XY_1Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]> { - let Latency = 7; - let NumMicroOps = 2; -} - -def Ampere1BWrite_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 7; - let NumMicroOps = 12; -} - -def Ampere1BWrite_8cyc_1BS_1L : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def Ampere1BWrite_8cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def Ampere1BWrite_8cyc_4XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 8; - let NumMicroOps = 4; -} - -def Ampere1BWrite_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitXY, Ampere1BUnitXY, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitS, Ampere1BUnitS, - Ampere1BUnitZ, Ampere1BUnitZ, - Ampere1BUnitZ, Ampere1BUnitZ]> { - let Latency = 9; - let NumMicroOps = 14; -} - -def Ampere1BWrite_9cyc_1A_1BS_1X : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]> { - let Latency = 9; - let NumMicroOps = 3; -} - -def Ampere1BWrite_9cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]> { - let Latency = 9; - let NumMicroOps = 3; -} - -def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 9; - let NumMicroOps = 1; -} - -def Ampere1BWrite_9cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 9; - let NumMicroOps = 3; -} - -def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> { - let Latency = 11; - let NumMicroOps = 3; -} - -def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 12; - let NumMicroOps = 1; -} - -def Ampere1BWrite_13cyc_1BS_1X : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]> { - let Latency = 13; - let NumMicroOps = 2; -} - -def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 17; - let NumMicroOps = 1; -} - -def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS, - Ampere1BUnitBS, - Ampere1BUnitX]> { - let Latency = 13; - let NumMicroOps = 3; -} - -def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 19; - let NumMicroOps = 1; -} - -def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 21; - let NumMicroOps = 1; -} - -def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 33; - let NumMicroOps = 1; -} - -def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 39; - let NumMicroOps = 1; -} - -def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { - let Latency = 63; - let NumMicroOps = 1; -} - -// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), -// which are a single uop, and for extended registers, which have full flexibility -// across Unit A or B for both uops. 
-def Ampere1BWrite_Arith : SchedWriteVariant<[ - SchedVar, - SchedVar, - SchedVar]>; - -def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[ - SchedVar, - SchedVar, - SchedVar]>; - -//===----------------------------------------------------------------------===// -// Map the target-defined scheduler read/write resources and latencies for Ampere-1. -// This provides a coarse model, which is then specialised below. - -def : WriteRes; // MOVN, MOVZ -def : WriteRes; // ALU -def : WriteRes { - let Latency = 2; - let NumMicroOps = 2; -} // ALU of Shifted-Reg -def : WriteRes { - let Latency = 2; - let NumMicroOps = 2; -} // ALU of Extended-Reg -def : WriteRes; // EXTR shifts a reg pair -def : WriteRes; // Shift/Scale -def : WriteRes { - let Latency = 13; -} // 32-bit Divide -def : WriteRes { - let Latency = 19; -} // 64-bit Divide -def : WriteRes { - let Latency = 3; -} // 32-bit Multiply -def : WriteRes { - let Latency = 3; -} // 64-bit Multiply -def : WriteRes; -def : WriteRes; -def : WriteRes { - let Latency = 3; -} // Load from base addr plus immediate offset -def : WriteRes { - let Latency = 1; -} // Store to base addr plus immediate offset -def : WriteRes { - let Latency = 1; - let NumMicroOps = 1; -} // Store a register pair. -def : WriteRes; -def : WriteRes { - let Latency = 3; - let NumMicroOps = 1; -} // Load from a register index (maybe scaled). -def : WriteRes { - let Latency = 1; - let NumMicroOps = 2; -} // Store to a register index (maybe scaled). -def : WriteRes { - let Latency = 2; -} // General floating-point ops. -def : WriteRes { - let Latency = 3; -} // Floating-point compare. -def : WriteRes { - let Latency = 3; -} // Float conversion. -def : WriteRes { -} // Float-int register copy. -def : WriteRes { - let Latency = 2; -} // Float-int register copy. -def : WriteRes { - let Latency = 4; -} // Floating-point multiply. -def : WriteRes { - let Latency = 19; -} // Floating-point division. -def : WriteRes { - let Latency = 3; -} // 64bit Vector D ops. -def : WriteRes { - let Latency = 3; -} // 128bit Vector Q ops. -def : WriteRes { - let Latency = 4; -} // Vector loads. -def : WriteRes { - let Latency = 2; -} // Vector stores. - -def : WriteRes { let Unsupported = 1; } - -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -def : WriteRes { - let Latency = 3; -} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP - -// Forwarding logic. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -//===----------------------------------------------------------------------===// -// Specialising the scheduling model further for Ampere-1B. 
- -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>; - -// Branch instructions -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; -def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>; - -// Common Short Sequence Compression (CSSC) -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs ABS)>; -def : InstRW<[Ampere1BWrite_1cyc_1BS], (instrs CNT)>; -def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs SMAX, SMIN)>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs CTZ)>; -def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instrs UMAX, USMIN)>; - -// Cryptography instructions -// -- AES encryption/decryption -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>; -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>; -// -- Polynomial multiplication -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; -// -- SHA-256 hash -def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>; -// -- SHA-256 schedule update -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>; -// -- SHA-3 instructions -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; -// -- SHA-512 hash -def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>; -// -- SHA-512 schedule update -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>; -// -- SHA1 choose/majority/parity -def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>; -// -- SHA1 hash/schedule update -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>; -def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>; -// -- SM3 hash -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$"0)>; -def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>; - -// FP and vector load instructions -// -- Load 1-element structure to one/all lanes -// ---- all lanes -def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], - (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// ---- one lane -def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], - (instregex "^LD1i(8|16|32|64)")>; -// -- Load 1-element structure to one/all lanes, 1D size -def : InstRW<[Ampere1BWrite_4cyc_1L], - (instregex "^LD1Rv1d")>; -// -- Load 1-element structures to 1 register -def : InstRW<[Ampere1BWrite_4cyc_1L], - (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 1-element structures to 2 registers -def : InstRW<[Ampere1BWrite_4cyc_2L], - (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 1-element structures to 3 registers -def : InstRW<[Ampere1BWrite_5cyc_3L], - (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 1-element structures to 4 registers -def : InstRW<[Ampere1BWrite_5cyc_4L], - (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Load 2-element structure to all lanes of 2 registers, 1D size -def : InstRW<[Ampere1BWrite_4cyc_2L], - (instregex "^LD2Rv1d")>; -// -- Load 2-element structure to all lanes of 2 registers, other sizes -def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], - (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// -- Load 2-element structure to one lane of 2 registers -def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], - (instregex "^LD2i(8|16|32|64)")>; -// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size -def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], - (instregex "^LD2Twov(16b|8h|4s|2d)")>; -// -- Load 2-element structures to 2 registers, 8B/4H/2S size -def : 
InstRW<[Ampere1BWrite_8cyc_2L_3XY], - (instregex "^LD2Twov(8b|4h|2s)")>; -// -- Load 3-element structure to all lanes of 3 registers, 1D size -def : InstRW<[Ampere1BWrite_5cyc_3L], - (instregex "^LD3Rv1d")>; -// -- Load 3-element structure to all lanes of 3 registers, other sizes -def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], - (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// -- Load 3-element structure to one lane of 3 registers -def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], - (instregex "^LD3i(8|16|32|64)")>; -// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes -def : InstRW<[Ampere1BWrite_8cyc_3L_3XY], - (instregex "^LD3Threev(16b|8h|4s)")>; -// -- Load 3-element structures to 3 registers, 2D size -def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], - (instregex "^LD3Threev2d")>; -// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes -def : InstRW<[Ampere1BWrite_9cyc_3L_3XY], - (instregex "^LD3Threev(8b|4h|2s)")>; -// -- Load 4-element structure to all lanes of 4 registers, 1D size -def : InstRW<[Ampere1BWrite_5cyc_4L], - (instregex "^LD4Rv1d")>; -// -- Load 4-element structure to all lanes of 4 registers, other sizes -def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], - (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; -// -- Load 4-element structure to one lane of 4 registers -def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], - (instregex "^LD4i(8|16|32|64)")>; -// -- Load 4-element structures to 4 registers, 2D size -def : InstRW<[Ampere1BWrite_8cyc_4L_4XY], - (instregex "^LD4Fourv2d")>; -// -- Load 4-element structures to 4 registers, 2S size -def : InstRW<[Ampere1BWrite_11cyc_4L_8XY], - (instregex "^LD4Fourv2s")>; -// -- Load 4-element structures to 4 registers, other sizes -def : InstRW<[Ampere1BWrite_10cyc_4L_8XY], - (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; -// -- Load pair, Q-form -def : InstRW<[Ampere1BWrite_4cyc_2L], (instregex "LDN?PQ")>; -// -- Load pair, S/D-form -def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; -// -- Load register -def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDU?R[BHSDQ]i")>; -// -- Load register, sign-extended register -def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; - -// FP and vector store instructions -// -- Store 1-element structure from one lane of 1 register -def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z], - (instregex "^ST1i(8|16|32|64)")>; -// -- Store 1-element structures from 1 register -def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], - (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 1-element structures from 2 registers -def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], - (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 1-element structures from 3 registers -def : InstRW<[Ampere1BWrite_4cyc_3S_3Z], - (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 1-element structures from 4 registers -def : InstRW<[Ampere1BWrite_5cyc_4S_4Z], - (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 2-element structure from one lane of 2 registers -def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], - (instregex "^ST2i(8|16|32|64)")>; -// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes -def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], - (instregex "^ST2Twov(16b|8h|4s|2d)")>; -// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes -def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z], - (instregex "^ST2Twov(8b|4h|2s)")>; -// -- Store 3-element structure from one lane of 3 registers -def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], - (instregex "^ST3i(8|16|32|64)")>; -// -- Store 
3-element structures from 3 registers -def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], - (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; -// -- Store 4-element structure from one lane of 4 registers -def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], - (instregex "^ST4i(8|16|32|64)")>; -// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes -def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], - (instregex "^ST4Fourv(16b|8h|4s)")>; -// -- Store 4-element structures from 4 registers, 2D sizes -def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], - (instregex "^ST4Fourv2d")>; -// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes -def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z], - (instregex "^ST4Fourv(8b|4h|2s)")>; -// -- Store pair, Q-form -def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>; -// -- Store pair, S/D-form -def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>; -// -- Store register -def : InstRW<[Ampere1BWrite_2cyc_1S_2Z], (instregex "^STU?R[BHSDQ](ui|i)")>; -// -- Store register, sign-extended register offset -def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; - -// FP data processing, bfloat16 format -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>; -def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>; -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>; - -// FP data processing, scalar/vector, half precision -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; -def : InstRW<[Ampere1BWrite_3cyc_1X], - (instregex "^FCMPE?H")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], - (instregex "^FCCMPE?H")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], - (instregex "^FCSELH")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; -// Convert FP to integer, H-form -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi]16")>; -// Convert to FP from GPR, H-form -def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>; -// Convert to FP from GPR, fixed-point, H-form -def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>; -def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>; -def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; -def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; -// FP square root, H-form -def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>; -// FP square root, 
vector-form, F16 -def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>; - -// FP data processing, scalar/vector, single/double precision -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1X], - (instregex "^FCMPE?(S|D)")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], - (instregex "^FCCMPE?(S|D)")>; -def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], - (instregex "^FCSEL(S|D)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; -// Convert FP to integer, S/D-form -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>; -// Convert to FP from GPR, S/D-form -def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>; -// Convert to FP from GPR, fixed-point, S/D-form -def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>; -def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>; -def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?MUL")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; -def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>; -def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>; -def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>; - -// FP miscellaneous instructions -def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>; -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>; -def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>; -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; -def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; -def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>; -def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; - -// Integer arithmetic and logical instructions -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "ADC(W|X)r", "SBC(W|X)r")>; -def : InstRW<[Ampere1BWrite_Arith], - (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>; -def : InstRW<[Ampere1BWrite_1cyc_1AB], - (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>; -def : 
InstRW<[Ampere1BWrite_ArithFlagsetting], - (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(ADC|SBC)S[WX]r")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(CCMN|CCMP)(X|W)")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], - (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; -def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>; -def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>; -def : InstRW<[Ampere1BWrite_3cyc_1BS], - (instregex "(S|U)MULHr")>; -def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB], - (instregex "(S|U)?M(ADD|SUB)L?r")>; - -// Integer load instructions -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "(LDNP|LDP|LDPSW)(X|W)")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDR(B|D|H|Q|S)ui")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDR(D|Q|W|X)l")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDTR(B|H|W|X)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDTRS(BW|BX|HW|HX|W)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDUR(BB|HH|X|W)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDURS(BW|BX|HW|HX|W)i")>; -def : InstRW<[Ampere1BWrite_3cyc_1L], - (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; -def : InstRW<[Ampere1BWrite_1cyc_1L], - (instrs PRFMl, PRFUMi, PRFUMi)>; -def : InstRW<[Ampere1BWrite_1cyc_1L], - (instrs PRFMroW, PRFMroX)>; - -// Integer miscellaneous instructions -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs ADR, ADRP)>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "EXTR(W|X)")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; -def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "CLS(W|X)")>; -def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs SETF8, SETF16)>; -def : InstRW<[Ampere1BWrite_1cyc_1AB], - (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; -def : InstRW<[Ampere1BWrite_1cyc_1B], - (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], - (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; - -// Integer store instructions -def : InstRW<[Ampere1BWrite_1cyc_2S], (instregex "STNP(X|W)i")>; -def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STPXi)>; -def : InstRW<[Ampere1BWrite_2cyc_1B_1S], (instrs STPWi)>; -def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>; -def : InstRW<[Ampere1BWrite_1cyc_1S], (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; -def : InstRW<[Ampere1BWrite_1cyc_1S], (instregex "STUR(BB|HH|X|W)i", - "STR(X|W)ui", - "STUR(BB|HH|X|W)i")>; -def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroX, STRXroX)>; -def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroW, STRXroW)>; - -// Memory tagging - -// Insert Random Tags -def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>; -// Load allocation tag -def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>; -// Store allocation tags -def : InstRW<[Ampere1BWrite_1cyc_1S], - (instrs STGi, STGM, STGPreIndex, STGPostIndex)>; -// Store allocation tags and pair of registers -def : InstRW<[Ampere1BWrite_1cyc_2S], - (instrs STGPi, STGPpre, STGPpost)>; -// Store allocation tags and zero data -def : InstRW<[Ampere1BWrite_1cyc_1S], - (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>; -// Store two tags -def : InstRW<[Ampere1BWrite_1cyc_2S], - (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>; -// 
Store two tags and zero data -def : InstRW<[Ampere1BWrite_1cyc_2S], - (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>; -// Subtract Pointer -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>; -// Subtract Pointer, flagset -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>; -// Insert Tag Mask -def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>; -// Arithmetic, immediate to logical address tag -def : InstRW<[Ampere1BWrite_1cyc_B], (instrs ADDG, SUBG)>; - -// Pointer authentication -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>; -def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], - (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; -def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], - (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>; -def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>; -def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>; - -// Vector integer instructions -// -- absolute difference -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", - "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; -// -- arithmetic -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", - "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", - "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; -// -- arithmetic, horizontal, 16B -def : InstRW<[Ampere1BWrite_8cyc_4XY], - (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; -def : InstRW<[Ampere1BWrite_8cyc_4XY], - (instregex "^[SU](MIN|MAX)Vv16i8v")>; -// -- arithmetic, horizontal, 4H/4S -def : InstRW<[Ampere1BWrite_4cyc_2XY], - (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; -def : InstRW<[Ampere1BWrite_4cyc_2XY], - (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; -// -- arithmetic, horizontal, 8B/8H -def : InstRW<[Ampere1BWrite_6cyc_3XY], - (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; -def : InstRW<[Ampere1BWrite_6cyc_3XY], - (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; -// -- arithmetic, narrowing -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; -def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; -// -- arithmetic, pairwise -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; -// -- arithmetic, saturating -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; -// -- bit count -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^(CLS|CLZ|CNT)v")>; -// -- compare -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", - "^CMHIv", "^CMHSv")>; -// -- compare non-zero -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>; -// -- dot product -def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; -// -- fp reciprocal estimate -def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>; -// -- integer reciprocal estimate -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; -// -- logical -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; -// -- logical, narrowing -def : InstRW<[Ampere1BWrite_6cyc_2XY], - (instregex "RSHRNv", - "SHRNv", "SQSHRNv", "SQSHRUNv", - "UQXTNv")>; -// -- matrix multiply -def : InstRW<[Ampere1BWrite_3cyc_1XY], - (instrs SMMLA, UMMLA, USMMLA)>; -// -- max/min -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^SMAXv", 
"^SMINv", "^UMAXv", "^UMINv")>; -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; -// -- move immediate -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; -// -- multiply -def : InstRW<[Ampere1Write_3cyc_1XY], - (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; -// -- multiply accumulate -def : InstRW<[Ampere1Write_3cyc_1XY], - (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; -// -- negation, saturating -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; -// -- reverse bits/bytes -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; -// -- shift -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; -// -- shift and accumulate -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; -// -- shift, saturating -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", - "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", - "^UQSHL")>; - -// Vector miscellaneous instructions -// -- duplicate element -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>; -// -- duplicate from GPR -def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>; -// -- extract narrow -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>; -// -- insert/extract element -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; -// -- move FP immediate -def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>; -// -- move element to GPR -def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>; -// -- move from GPR to any element -def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; -// -- table lookup -def : InstRW<[Ampere1BWrite_2cyc_1XY], - (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; -def : InstRW<[Ampere1BWrite_4cyc_2XY], - (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; -def : InstRW<[Ampere1BWrite_6cyc_3XY], - (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; -def : InstRW<[Ampere1BWrite_8cyc_4XY], - (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; -// -- transpose -def : InstRW<[Ampere1Write_2cyc_1XY], - (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; -// -- zip/unzip -def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; - -} // SchedModel = Ampere1BModel -- cgit v1.1 From 8f23464a5d957242c89ca6f33d4379c42519cd81 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sat, 10 Feb 2024 01:00:14 +0100 Subject: [llvm-lib][llvm-dlltool][Object] Add support for EXPORTAS name types. (#78772) EXPORTAS is a new name type in import libraries. It's used by default on ARM64EC, but it's allowed on other platforms as well. 
--- llvm/include/llvm/BinaryFormat/COFF.h | 5 +- llvm/include/llvm/Object/COFFImportFile.h | 4 ++ llvm/lib/Object/COFFImportFile.cpp | 66 ++++++++++++------- llvm/lib/Object/COFFModuleDefinition.cpp | 13 +++- llvm/test/tools/llvm-lib/exportas.test | 94 ++++++++++++++++++++++++++++ llvm/tools/llvm-readobj/COFFImportDumper.cpp | 3 + 6 files changed, 162 insertions(+), 23 deletions(-) create mode 100644 llvm/test/tools/llvm-lib/exportas.test diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h index 522ee37..72461d0 100644 --- a/llvm/include/llvm/BinaryFormat/COFF.h +++ b/llvm/include/llvm/BinaryFormat/COFF.h @@ -716,7 +716,10 @@ enum ImportNameType : unsigned { IMPORT_NAME_NOPREFIX = 2, /// The import name is the public symbol name, but skipping the leading ?, /// @, or optionally _, and truncating at the first @. - IMPORT_NAME_UNDECORATE = 3 + IMPORT_NAME_UNDECORATE = 3, + /// The import name is specified as a separate string in the import library + /// object file. + IMPORT_NAME_EXPORTAS = 4 }; enum class GuardFlags : uint32_t { diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h index 45a4a79..7c5846e9 100644 --- a/llvm/include/llvm/Object/COFFImportFile.h +++ b/llvm/include/llvm/Object/COFFImportFile.h @@ -92,6 +92,10 @@ struct COFFShortExport { /// file, this is "baz" in "EXPORTS\nfoo = bar == baz". std::string AliasTarget; + /// Specifies EXPORTAS name. In a .def file, this is "bar" in + /// "EXPORTS\nfoo EXPORTAS bar". + std::string ExportAs; + uint16_t Ordinal = 0; bool Noname = false; bool Data = false; diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index d7d26f4..51e6274 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -71,6 +71,12 @@ StringRef COFFImportFile::getExportName() const { name = ltrim1(name, "?@_"); name = name.substr(0, name.find('@')); break; + case IMPORT_NAME_EXPORTAS: { + // Skip DLL name + name = Data.getBuffer().substr(sizeof(*hdr) + name.size() + 1); + name = name.split('\0').second.split('\0').first; + break; + } default: break; } @@ -209,6 +215,7 @@ public: // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, ImportType Type, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine); // Create a weak external file which is described in PE/COFF Aux Format 3. 
@@ -500,12 +507,13 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector<uint8_t> &Buffer) { return {MemoryBufferRef{F, ImportName}}; } -NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, - uint16_t Ordinal, - ImportType ImportType, - ImportNameType NameType, - MachineTypes Machine) { +NewArchiveMember +ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, + ImportType ImportType, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine) { size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs + if (!ExportName.empty()) + ImpSize += ExportName.size() + 1; size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate<char>(Size); memset(Buf, 0, Size); @@ -525,6 +533,10 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, memcpy(P, Sym.data(), Sym.size()); P += Sym.size() + 1; memcpy(P, ImportName.data(), ImportName.size()); + if (!ExportName.empty()) { + P += ImportName.size() + 1; + memcpy(P, ExportName.data(), ExportName.size()); + } return {MemoryBufferRef(StringRef(Buf, Size), ImportName)}; } @@ -641,27 +653,39 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, ImportType = IMPORT_CONST; StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName; - ImportNameType NameType = E.Noname - ? IMPORT_ORDINAL - : getNameType(SymbolName, E.Name, - Machine, MinGW); - Expected<std::string> Name = E.ExtName.empty() - ? std::string(SymbolName) - : replace(SymbolName, E.Name, E.ExtName); - - if (!Name) - return Name.takeError(); - - if (!E.AliasTarget.empty() && *Name != E.AliasTarget) { + std::string Name; + + if (E.ExtName.empty()) { + Name = std::string(SymbolName); + } else { + Expected<std::string> ReplacedName = + replace(SymbolName, E.Name, E.ExtName); + if (!ReplacedName) + return ReplacedName.takeError(); + Name.swap(*ReplacedName); + } + + if (!E.AliasTarget.empty() && Name != E.AliasTarget) { Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, false, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, false, Machine)); Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, true, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, true, Machine)); continue; } - Members.push_back( - OF.createShortImport(*Name, E.Ordinal, ImportType, NameType, Machine)); + ImportNameType NameType; + std::string ExportName; + if (E.Noname) { + NameType = IMPORT_ORDINAL; + } else if (!E.ExportAs.empty()) { + NameType = IMPORT_NAME_EXPORTAS; + ExportName = E.ExportAs; + } else { + NameType = getNameType(SymbolName, E.Name, Machine, MinGW); + } + + Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType, + NameType, ExportName, Machine)); } return writeArchive(Path, Members, SymtabWritingMode::NormalSymtab, diff --git a/llvm/lib/Object/COFFModuleDefinition.cpp b/llvm/lib/Object/COFFModuleDefinition.cpp index 35e6ab8..549348a 100644 --- a/llvm/lib/Object/COFFModuleDefinition.cpp +++ b/llvm/lib/Object/COFFModuleDefinition.cpp @@ -39,6 +39,7 @@ enum Kind { KwConstant, KwData, KwExports, + KwExportAs, KwHeapsize, KwLibrary, KwName, @@ -116,6 +117,7 @@ public: .Case("CONSTANT", KwConstant) .Case("DATA", KwData) .Case("EXPORTS", KwExports) + .Case("EXPORTAS", KwExportAs) .Case("HEAPSIZE", KwHeapsize) .Case("LIBRARY", KwLibrary) .Case("NAME", KwName) @@ -284,7 +286,16 @@ private: E.AliasTarget = std::string("_").append(E.AliasTarget); continue; } - unget(); + // EXPORTAS must be at the end of export definition + if (Tok.K == KwExportAs) { + read(); + if (Tok.K == Eof) + return createError( "unexpected
end of file, EXPORTAS identifier expected"); + E.ExportAs = std::string(Tok.Value); + } else { + unget(); + } Info.Exports.push_back(E); return Error::success(); } diff --git a/llvm/test/tools/llvm-lib/exportas.test b/llvm/test/tools/llvm-lib/exportas.test new file mode 100644 index 0000000..f6e845c --- /dev/null +++ b/llvm/test/tools/llvm-lib/exportas.test @@ -0,0 +1,94 @@ +Test EXPORTAS in importlibs. + +RUN: split-file %s %t.dir && cd %t.dir +RUN: llvm-lib -machine:amd64 -def:test.def -out:test.lib + +RUN: llvm-nm --print-armap test.lib | FileCheck --check-prefix=ARMAP %s + +ARMAP: Archive map +ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll +ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll +ARMAP-NEXT: __imp_func in test.dll +ARMAP-NEXT: __imp_func2 in test.dll +ARMAP-NEXT: __imp_func3 in test.dll +ARMAP-NEXT: __imp_mydata in test.dll +ARMAP-NEXT: func in test.dll +ARMAP-NEXT: func2 in test.dll +ARMAP-NEXT: func3 in test.dll +ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll + +RUN: llvm-readobj test.lib | FileCheck --check-prefix=READOBJ %s + +READOBJ: File: test.lib(test.dll) +READOBJ-NEXT: Format: COFF-x86-64 +READOBJ-NEXT: Arch: x86_64 +READOBJ-NEXT: AddressSize: 64bit +READOBJ-EMPTY: +READOBJ-NEXT: File: test.lib(test.dll) +READOBJ-NEXT: Format: COFF-x86-64 +READOBJ-NEXT: Arch: x86_64 +READOBJ-NEXT: AddressSize: 64bit +READOBJ-EMPTY: +READOBJ-NEXT: File: test.lib(test.dll) +READOBJ-NEXT: Format: COFF-x86-64 +READOBJ-NEXT: Arch: x86_64 +READOBJ-NEXT: AddressSize: 64bit +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expfunc +READOBJ-NEXT: Symbol: __imp_func +READOBJ-NEXT: Symbol: func +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: data +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expdata +READOBJ-NEXT: Symbol: __imp_mydata +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expfunc2 +READOBJ-NEXT: Symbol: __imp_func2 +READOBJ-NEXT: Symbol: func2 +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-x86-64 +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expfunc3 +READOBJ-NEXT: Symbol: __imp_func3 +READOBJ-NEXT: Symbol: func3 + + +EXPORTAS must be at the end of entry declaration. 
+RUN: not llvm-lib -machine:amd64 -def:test2.def -out:test2.lib 2>&1 \ +RUN: | FileCheck --check-prefix=ERROR %s +RUN: not llvm-lib -machine:amd64 -def:test3.def -out:test3.lib 2>&1 \ +RUN: | FileCheck --check-prefix=ERROR %s +ERROR: Invalid data was encountered while parsing the file + + +#--- test.def +LIBRARY test.dll +EXPORTS + func EXPORTAS expfunc + mydata DATA EXPORTAS expdata + func2 = myfunc2 EXPORTAS expfunc2 + func3 = otherdll.otherfunc3 EXPORTAS expfunc3 + +#--- test2.def +LIBRARY test.dll +EXPORTS + func EXPORTAS expfunc + mydata EXPORTAS expdata DATA + +#--- test3.def +LIBRARY test.dll +EXPORTS + mydata EXPORTAS diff --git a/llvm/tools/llvm-readobj/COFFImportDumper.cpp b/llvm/tools/llvm-readobj/COFFImportDumper.cpp index 656ca32..0ab2a17 100644 --- a/llvm/tools/llvm-readobj/COFFImportDumper.cpp +++ b/llvm/tools/llvm-readobj/COFFImportDumper.cpp @@ -45,6 +45,9 @@ void dumpCOFFImportFile(const COFFImportFile *File, ScopedPrinter &Writer) { case COFF::IMPORT_NAME_UNDECORATE: Writer.printString("Name type", "undecorate"); break; + case COFF::IMPORT_NAME_EXPORTAS: + Writer.printString("Name type", "export as"); + break; } if (H->getNameType() != COFF::IMPORT_ORDINAL) -- cgit v1.1 From 224145ee882e32aaa1fae9ae88698cf1b07b22e4 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Fri, 9 Feb 2024 16:01:42 -0800 Subject: [DWARFDump][nfc] Fix incorrect comment (#81276) It claimed to dump all sections by default, but this hasn't been true since 2017: https://reviews.llvm.org/D37717 --- llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 559e7a6..8cdd84b 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -845,8 +845,9 @@ int main(int argc, char **argv) { bool OffsetRequested = false; - // Defaults to dumping all sections, unless brief mode is specified in which - // case only the .debug_info section in dumped. + // Defaults to dumping only debug_info, unless: A) verbose mode is specified, + // in which case all sections are dumped, or B) a specific section is + // requested. #define HANDLE_DWARF_SECTION(ENUM_NAME, ELF_NAME, CMDLINE_NAME, OPTION) \ if (Dump##ENUM_NAME.IsRequested) { \ DumpType |= DIDT_##ENUM_NAME; \ -- cgit v1.1 From 3a3302ef7b48f7907d0fb62b380d9d515a5f35e4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 9 Feb 2024 16:10:10 -0800 Subject: [flang][test] Update driver-help*.f90 after 9397d23671f26ab8631e90f688ae2ea212f3c770 --- flang/test/Driver/driver-help-hidden.f90 | 2 +- flang/test/Driver/driver-help.f90 | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90 index 36b7d20..44dbac4 100644 --- a/flang/test/Driver/driver-help-hidden.f90 +++ b/flang/test/Driver/driver-help-hidden.f90 @@ -148,7 +148,7 @@ ! CHECK-NEXT: -Rpass= Report transformations performed by optimization passes whose name matches the given POSIX regular expression ! CHECK-NEXT: -R Enable the specified remark ! CHECK-NEXT: -save-temps= Save intermediate compilation results. -! CHECK-NEXT: -save-temps Save intermediate compilation results +! CHECK-NEXT: -save-temps Alias for --save-temps=cwd ! CHECK-NEXT: -std= Language standard to compile for ! CHECK-NEXT: -S Only run preprocess and compilation steps ! 
CHECK-NEXT: --target= Generate code for the given target diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90 index f69f437..b4280a4 100644 --- a/flang/test/Driver/driver-help.f90 +++ b/flang/test/Driver/driver-help.f90 @@ -134,7 +134,7 @@ ! HELP-NEXT: -Rpass= Report transformations performed by optimization passes whose name matches the given POSIX regular expression ! HELP-NEXT: -R Enable the specified remark ! HELP-NEXT: -save-temps= Save intermediate compilation results. -! HELP-NEXT: -save-temps Save intermediate compilation results +! HELP-NEXT: -save-temps Alias for --save-temps=cwd ! HELP-NEXT: -std= Language standard to compile for ! HELP-NEXT: -S Only run preprocess and compilation steps ! HELP-NEXT: --target= Generate code for the given target @@ -275,7 +275,7 @@ ! HELP-FC1-NEXT: -Rpass= Report transformations performed by optimization passes whose name matches the given POSIX regular expression ! HELP-FC1-NEXT: -R Enable the specified remark ! HELP-FC1-NEXT: -save-temps= Save intermediate compilation results. -! HELP-FC1-NEXT: -save-temps Save intermediate compilation results +! HELP-FC1-NEXT: -save-temps Alias for --save-temps=cwd ! HELP-FC1-NEXT: -std= Language standard to compile for ! HELP-FC1-NEXT: -S Only run preprocess and compilation steps ! HELP-FC1-NEXT: -target-cpu Target a specific cpu type -- cgit v1.1 From eb1b428750181ea742c547db0bc7136cd5b8f732 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Fri, 9 Feb 2024 16:52:25 -0800 Subject: [llvm][aarch64] Apple A16 & A17 had adrp-add fusion, but A14 did not (#81325) --- llvm/lib/Target/AArch64/AArch64.td | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index e76204f..3377fcf 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1120,7 +1120,6 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseArithmeticLogic, FeatureFuseCCSelect, FeatureFuseCryptoEOR, - FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMove, @@ -1149,6 +1148,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, @@ -1165,6 +1165,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, -- cgit v1.1 From b7cc401df5ac714f5de0cbc64e6c7083d2c1d712 Mon Sep 17 00:00:00 2001 From: Enna1 Date: Sat, 10 Feb 2024 09:10:24 +0800 Subject: [hwasan] Call user provided callback function for both fatal and non-… (#80429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …fatal error report This makes the behavior of HWASan match that of ASan: always call the user provided callback function for an error report, but only call `SetAbortMessage()` on Android when `flags()->halt_on_error` is true.
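A minimal usage sketch (an assumption for illustration, not part of the patch): user code receives these reports through the callback registered via the public hwasan interface header; the entry point name __hwasan_set_error_report_callback is assumed here.

  #include <sanitizer/hwasan_interface.h>
  #include <cstdio>

  // Receives the full textual report. With this change the callback
  // fires for both fatal and non-fatal errors, not only fatal ones.
  static void report_callback(const char *report) {
    std::fprintf(stderr, "hwasan report:\n%s", report);
  }

  int main() {
    __hwasan_set_error_report_callback(&report_callback);
    // ... exercise the application under HWASan ...
    return 0;
  }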
--- compiler-rt/lib/hwasan/hwasan_report.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index c3d260d..d3398ff 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -40,7 +40,7 @@ class ScopedReport { public: explicit ScopedReport(bool fatal) : fatal(fatal) { Lock lock(&error_message_lock_); - error_message_ptr_ = fatal ? &error_message_ : nullptr; + error_message_ptr_ = &error_message_; ++hwasan_report_count; } -- cgit v1.1 From c344953ae78b0e9545b7374a2bea35abaee18c38 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Fri, 9 Feb 2024 17:24:27 -0800 Subject: Fix 01706e7 on 32-bit platforms Make the type match the printf format. --- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 0e4f4e1..948a5d7 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2949,7 +2949,8 @@ void Dumper::printSymbol(const SymbolRef &Symbol, outs() << '\t' << format(Fmt, ELFSymbolRef(Symbol).getSize()); else if (O.isWasm()) outs() << '\t' - << format(Fmt, cast<WasmObjectFile>(O).getSymbolSize(Symbol)); + << format(Fmt, static_cast<uint64_t>( + cast<WasmObjectFile>(O).getSymbolSize(Symbol))); if (O.isELF()) { if (!SymbolVersions.empty()) { -- cgit v1.1 From cc02e50e77419475fa958b2626600a48f8208098 Mon Sep 17 00:00:00 2001 From: Pete Steinfeld <47540744+psteinfeld@users.noreply.github.com> Date: Fri, 9 Feb 2024 18:04:53 -0800 Subject: Revert "[Flang] Update the fix of PR 80738 to cover generic interface… (#81321) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … inside modules (#81087)" This reverts commit 0802596df3d1ffd15f6b828a0f5c1e5b687a730f. See comments in PR #81087 for a test case that shows why I'm reverting. --- flang/lib/Semantics/resolve-names.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2a42c791..36deab9 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5648,10 +5648,9 @@ void DeclarationVisitor::Post(const parser::ProcDecl &x) { const auto &name{std::get<parser::Name>(x.t)}; const Symbol *procInterface{nullptr}; if (interfaceName_) { - Symbol *ultimate{&interfaceName_->symbol->GetUltimate()}; - procInterface = ultimate->has<GenericDetails>() - ? ultimate->get<GenericDetails>().specific() - : ultimate; + procInterface = interfaceName_->symbol->has<GenericDetails>() + ? interfaceName_->symbol->get<GenericDetails>().specific() + : interfaceName_->symbol; } auto attrs{HandleSaveName(name.source, GetAttrs())}; DerivedTypeDetails *dtDetails{nullptr}; -- cgit v1.1 From 637c37025d2a9747d440034fff7b4d549dead6f3 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Fri, 9 Feb 2024 21:13:14 -0500 Subject: [libc][math] Add C23 math function frexpf128.
(#81337) --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 + libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/frexpf128.h | 20 ++++++++++ libc/src/math/generic/CMakeLists.txt | 23 +++++++++--- libc/src/math/generic/frexpf128.cpp | 19 ++++++++++ libc/test/src/math/smoke/CMakeLists.txt | 18 ++++++--- libc/test/src/math/smoke/FrexpTest.h | 58 ++++++++++++++--------------- libc/test/src/math/smoke/frexp_test.cpp | 2 +- libc/test/src/math/smoke/frexpf128_test.cpp | 13 +++++++ libc/test/src/math/smoke/frexpf_test.cpp | 2 +- libc/test/src/math/smoke/frexpl_test.cpp | 2 +- 15 files changed, 119 insertions(+), 45 deletions(-) create mode 100644 libc/src/math/frexpf128.h create mode 100644 libc/src/math/generic/frexpf128.cpp create mode 100644 libc/test/src/math/smoke/frexpf128_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index f75b267..bc09f4881 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -386,6 +386,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 + libc.src.math.frexpf128 libc.src.math.roundf128 libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 762beb9..02412e7 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -395,6 +395,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 + libc.src.math.frexpf128 libc.src.math.roundf128 libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 52a3ce0..8ca9375 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -414,6 +414,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.floorf128 libc.src.math.fmaxf128 libc.src.math.fminf128 + libc.src.math.frexpf128 libc.src.math.roundf128 libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 2758b42..9460449 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -176,6 +176,8 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | frexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| frexpf128 | |check| | |check| | | |check| | | | | | | | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ilogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ilogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 9c8b5e5..afddc77 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -401,6 +401,7 @@ def StdC : StandardSpec<"stdc"> { 
FunctionSpec<"frexp", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpl", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"frexpf128", RetValSpec, [ArgSpec, ArgSpec]], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"hypot", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"hypotf", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 8cdd84a..985585c 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -137,6 +137,7 @@ add_math_entrypoint_object(fmodf) add_math_entrypoint_object(frexp) add_math_entrypoint_object(frexpf) add_math_entrypoint_object(frexpl) +add_math_entrypoint_object(frexpf128) add_math_entrypoint_object(hypot) add_math_entrypoint_object(hypotf) diff --git a/libc/src/math/frexpf128.h b/libc/src/math/frexpf128.h new file mode 100644 index 0000000..5d70860 --- /dev/null +++ b/libc/src/math/frexpf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for frexpf128 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FREXPF128_H +#define LLVM_LIBC_SRC_MATH_FREXPF128_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +float128 frexpf128(float128 x, int *exp); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FREXPF128_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 3216ec3..fdf383f 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -916,10 +916,10 @@ add_entrypoint_object( frexp.cpp HDRS ../frexp.h + COMPILE_OPTIONS + -O3 DEPENDS libc.src.__support.FPUtil.manipulation_functions - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -928,10 +928,10 @@ add_entrypoint_object( frexpf.cpp HDRS ../frexpf.h + COMPILE_OPTIONS + -O3 DEPENDS libc.src.__support.FPUtil.manipulation_functions - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -940,10 +940,23 @@ add_entrypoint_object( frexpl.cpp HDRS ../frexpl.h + COMPILE_OPTIONS + -O3 DEPENDS libc.src.__support.FPUtil.manipulation_functions +) + +add_entrypoint_object( + frexpf128 + SRCS + frexpf128.cpp + HDRS + ../frexpf128.h COMPILE_OPTIONS - -O2 + -O3 + DEPENDS + libc.src.__support.macros.properties.float + libc.src.__support.FPUtil.manipulation_functions ) add_entrypoint_object( diff --git a/libc/src/math/generic/frexpf128.cpp b/libc/src/math/generic/frexpf128.cpp new file mode 100644 index 0000000..b50f37d --- /dev/null +++ b/libc/src/math/generic/frexpf128.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of frexpf128 function ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/frexpf128.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float128, frexpf128, (float128 x, int *exp)) { + return fputil::frexp(x, *exp); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 93ce0b7..0d55be5 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -779,9 +779,7 @@ add_fp_unittest( HDRS FrexpTest.h DEPENDS - libc.include.math libc.src.math.frexp - libc.src.__support.FPUtil.basic_operations ) add_fp_unittest( @@ -793,9 +791,7 @@ add_fp_unittest( HDRS FrexpTest.h DEPENDS - libc.include.math libc.src.math.frexpf - libc.src.__support.FPUtil.basic_operations ) add_fp_unittest( @@ -807,9 +803,19 @@ add_fp_unittest( HDRS FrexpTest.h DEPENDS - libc.include.math libc.src.math.frexpl - libc.src.__support.FPUtil.basic_operations +) + +add_fp_unittest( + frexpf128_test + SUITE + libc-math-smoke-tests + SRCS + frexpf128_test.cpp + HDRS + FrexpTest.h + DEPENDS + libc.src.math.frexpf128 ) # FIXME: These tests are currently broken for NVPTX. diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h index 981872a..bf99a9a 100644 --- a/libc/test/src/math/smoke/FrexpTest.h +++ b/libc/test/src/math/smoke/FrexpTest.h @@ -10,81 +10,76 @@ #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include - template class FrexpTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) - static constexpr StorageType HIDDEN_BIT = - StorageType(1) << LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN; - public: typedef T (*FrexpFunc)(T, int *); void testSpecialNumbers(FrexpFunc func) { int exponent; - ASSERT_FP_EQ(aNaN, func(aNaN, &exponent)); - ASSERT_FP_EQ(inf, func(inf, &exponent)); - ASSERT_FP_EQ(neg_inf, func(neg_inf, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, &exponent)); - ASSERT_FP_EQ(0.0, func(0.0, &exponent)); - ASSERT_EQ(exponent, 0); + EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, &exponent)); + EXPECT_EQ(exponent, 0); - ASSERT_FP_EQ(-0.0, func(-0.0, &exponent)); - ASSERT_EQ(exponent, 0); + EXPECT_FP_EQ_ALL_ROUNDING(-0.0, func(-0.0, &exponent)); + EXPECT_EQ(exponent, 0); } void testPowersOfTwo(FrexpFunc func) { int exponent; - EXPECT_FP_EQ(T(0.5), func(T(1.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(1.0), &exponent)); EXPECT_EQ(exponent, 1); - EXPECT_FP_EQ(T(-0.5), func(T(-1.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-1.0), &exponent)); EXPECT_EQ(exponent, 1); - EXPECT_FP_EQ(T(0.5), func(T(2.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(2.0), &exponent)); EXPECT_EQ(exponent, 2); - EXPECT_FP_EQ(T(-0.5), func(T(-2.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-2.0), &exponent)); EXPECT_EQ(exponent, 2); - EXPECT_FP_EQ(T(0.5), func(T(4.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(4.0), &exponent)); EXPECT_EQ(exponent, 3); - EXPECT_FP_EQ(T(-0.5), func(T(-4.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-4.0), &exponent)); EXPECT_EQ(exponent, 3); - EXPECT_FP_EQ(T(0.5), func(T(8.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), 
func(T(8.0), &exponent)); EXPECT_EQ(exponent, 4); - EXPECT_FP_EQ(T(-0.5), func(T(-8.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-8.0), &exponent)); EXPECT_EQ(exponent, 4); - EXPECT_FP_EQ(T(0.5), func(T(16.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(16.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(-0.5), func(T(-16.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-16.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(0.5), func(T(32.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(32.0), &exponent)); EXPECT_EQ(exponent, 6); - EXPECT_FP_EQ(T(-0.5), func(T(-32.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-32.0), &exponent)); EXPECT_EQ(exponent, 6); } void testSomeIntegers(FrexpFunc func) { int exponent; - EXPECT_FP_EQ(T(0.75), func(T(24.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.75), func(T(24.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(-0.75), func(T(-24.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.75), func(T(-24.0), &exponent)); EXPECT_EQ(exponent, 5); - EXPECT_FP_EQ(T(0.625), func(T(40.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.625), func(T(40.0), &exponent)); EXPECT_EQ(exponent, 6); - EXPECT_FP_EQ(T(-0.625), func(T(-40.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.625), func(T(-40.0), &exponent)); EXPECT_EQ(exponent, 6); - EXPECT_FP_EQ(T(0.78125), func(T(800.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(0.78125), func(T(800.0), &exponent)); EXPECT_EQ(exponent, 10); - EXPECT_FP_EQ(T(-0.78125), func(T(-800.0), &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(T(-0.78125), func(T(-800.0), &exponent)); EXPECT_EQ(exponent, 10); } }; @@ -93,4 +88,5 @@ public: using LlvmLibcFrexpTest = FrexpTest; \ TEST_F(LlvmLibcFrexpTest, SpecialNumbers) { testSpecialNumbers(&func); } \ TEST_F(LlvmLibcFrexpTest, PowersOfTwo) { testPowersOfTwo(&func); } \ - TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); } + TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); } \ + static_assert(true, "Require semicolon.") diff --git a/libc/test/src/math/smoke/frexp_test.cpp b/libc/test/src/math/smoke/frexp_test.cpp index 4d078ba..79aa972 100644 --- a/libc/test/src/math/smoke/frexp_test.cpp +++ b/libc/test/src/math/smoke/frexp_test.cpp @@ -10,4 +10,4 @@ #include "src/math/frexp.h" -LIST_FREXP_TESTS(double, LIBC_NAMESPACE::frexp) +LIST_FREXP_TESTS(double, LIBC_NAMESPACE::frexp); diff --git a/libc/test/src/math/smoke/frexpf128_test.cpp b/libc/test/src/math/smoke/frexpf128_test.cpp new file mode 100644 index 0000000..a0df32f --- /dev/null +++ b/libc/test/src/math/smoke/frexpf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for frexpf128 -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
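// NOTE (annotation, not part of the patch): LIST_FREXP_TESTS, invoked at the
// bottom of this new file, now ends in `static_assert(true, "Require
// semicolon.")` (see the FrexpTest.h hunk above). That is a standard macro
// trick forcing call sites to supply a terminating semicolon, which is why
// the frexp/frexpf/frexpl test files in this patch gain trailing semicolons.
// A self-contained illustration of the pattern (names here are invented):
//
//   #define DECLARE_SUITE(T, func)      \
//     using SuiteAlias = FrexpTest<T>;  \
//     static_assert(true, "Require semicolon.")
//
//   DECLARE_SUITE(float128, LIBC_NAMESPACE::frexpf128); // ';' now required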
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FrexpTest.h" + +#include "src/math/frexpf128.h" + +LIST_FREXP_TESTS(float128, LIBC_NAMESPACE::frexpf128); diff --git a/libc/test/src/math/smoke/frexpf_test.cpp b/libc/test/src/math/smoke/frexpf_test.cpp index 577eb96..f2ae637e 100644 --- a/libc/test/src/math/smoke/frexpf_test.cpp +++ b/libc/test/src/math/smoke/frexpf_test.cpp @@ -10,4 +10,4 @@ #include "src/math/frexpf.h" -LIST_FREXP_TESTS(float, LIBC_NAMESPACE::frexpf) +LIST_FREXP_TESTS(float, LIBC_NAMESPACE::frexpf); diff --git a/libc/test/src/math/smoke/frexpl_test.cpp b/libc/test/src/math/smoke/frexpl_test.cpp index e5184cd..3e1f8b4 100644 --- a/libc/test/src/math/smoke/frexpl_test.cpp +++ b/libc/test/src/math/smoke/frexpl_test.cpp @@ -10,4 +10,4 @@ #include "src/math/frexpl.h" -LIST_FREXP_TESTS(long double, LIBC_NAMESPACE::frexpl) +LIST_FREXP_TESTS(long double, LIBC_NAMESPACE::frexpl); -- cgit v1.1 From c5cbfc5689a26651634e1990b430e917d1ae85da Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 19:53:04 -0800 Subject: [clang-format] Rename option AlwaysBreakTemplateDeclarations (#81093) Drop the "Always" prefix to remove the self-contradiction. --- clang/docs/ClangFormatStyleOptions.rst | 117 +++++++++++++++-------------- clang/docs/ReleaseNotes.rst | 3 + clang/docs/tools/dump_format_style.py | 7 ++ clang/include/clang/Format/Format.h | 7 +- clang/lib/Format/Format.cpp | 6 +- clang/unittests/Format/ConfigParseTest.cpp | 13 ++++ 6 files changed, 94 insertions(+), 59 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 4ccdd2d..5deeff0 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -1659,62 +1659,8 @@ the configuration (without a prefix: ``Auto``). .. _AlwaysBreakTemplateDeclarations: -**AlwaysBreakTemplateDeclarations** (``BreakTemplateDeclarationsStyle``) :versionbadge:`clang-format 3.4` :ref:`¶ ` - The template declaration breaking style to use. - - Possible values: - - * ``BTDS_Leave`` (in configuration: ``Leave``) - Do not change the line breaking before the declaration. - - .. code-block:: c++ - - template - T foo() { - } - template T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - * ``BTDS_No`` (in configuration: ``No``) - Do not force break before declaration. - ``PenaltyBreakTemplateDeclaration`` is taken into account. - - .. code-block:: c++ - - template T foo() { - } - template T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - * ``BTDS_MultiLine`` (in configuration: ``MultiLine``) - Force break after template declaration only when the following - declaration spans multiple lines. - - .. code-block:: c++ - - template T foo() { - } - template - T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - * ``BTDS_Yes`` (in configuration: ``Yes``) - Always break after template declaration. - - .. code-block:: c++ - - template - T foo() { - } - template - T foo(int aaaaaaaaaaaaaaaaaaaaa, - int bbbbbbbbbbbbbbbbbbbbb) { - } - - +**AlwaysBreakTemplateDeclarations** (``deprecated``) :versionbadge:`clang-format 3.4` :ref:`¶ ` + This option is renamed to ``BreakTemplateDeclarations``. .. _AttributeMacros: @@ -3014,6 +2960,65 @@ the configuration (without a prefix: ``Auto``). string x = "veryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryLongString"; +.. 
_BreakTemplateDeclarations: + +**BreakTemplateDeclarations** (``BreakTemplateDeclarationsStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` + The template declaration breaking style to use. + + Possible values: + + * ``BTDS_Leave`` (in configuration: ``Leave``) + Do not change the line breaking before the declaration. + + .. code-block:: c++ + + template + T foo() { + } + template T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + * ``BTDS_No`` (in configuration: ``No``) + Do not force break before declaration. + ``PenaltyBreakTemplateDeclaration`` is taken into account. + + .. code-block:: c++ + + template T foo() { + } + template T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + * ``BTDS_MultiLine`` (in configuration: ``MultiLine``) + Force break after template declaration only when the following + declaration spans multiple lines. + + .. code-block:: c++ + + template T foo() { + } + template + T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + * ``BTDS_Yes`` (in configuration: ``Yes``) + Always break after template declaration. + + .. code-block:: c++ + + template + T foo() { + } + template + T foo(int aaaaaaaaaaaaaaaaaaaaa, + int bbbbbbbbbbbbbbbbbbbbb) { + } + + + .. _ColumnLimit: **ColumnLimit** (``Unsigned``) :versionbadge:`clang-format 3.7` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7631f3b..ece6013 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -291,6 +291,9 @@ AST Matchers clang-format ------------ +- ``AlwaysBreakTemplateDeclarations`` is deprecated and renamed to + ``BreakTemplateDeclarations``. + libclang -------- diff --git a/clang/docs/tools/dump_format_style.py b/clang/docs/tools/dump_format_style.py index e41891f..af0124b 100755 --- a/clang/docs/tools/dump_format_style.py +++ b/clang/docs/tools/dump_format_style.py @@ -308,6 +308,7 @@ class OptionsReader: enum = None nested_struct = None version = None + deprecated = False for line in self.header: self.lineno += 1 @@ -327,6 +328,8 @@ class OptionsReader: match = re.match(r"/// \\version\s*(?P[0-9.]+)*", line) if match: version = match.group("version") + elif line.startswith("/// @deprecated"): + deprecated = True elif line.startswith("///"): comment += self.__clean_comment_line(line) elif line.startswith("enum"): @@ -345,6 +348,9 @@ class OptionsReader: field_type, field_name = re.match( r"([<>:\w(,\s)]+)\s+(\w+);", line ).groups() + if deprecated: + field_type = "deprecated" + deprecated = False if not version: self.__warning(f"missing version for {field_name}", line) @@ -456,6 +462,7 @@ class OptionsReader: "std::vector", "std::vector", "std::optional", + "deprecated", ]: if option.type in enums: option.enum = enums[option.type] diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index cb14d98..b4969aa 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -1075,8 +1075,9 @@ struct FormatStyle { BTDS_Yes }; - /// The template declaration breaking style to use. + /// This option is renamed to ``BreakTemplateDeclarations``. /// \version 3.4 + /// @deprecated BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations; /// A vector of strings that should be interpreted as attributes/qualifiers @@ -2293,6 +2294,10 @@ struct FormatStyle { /// \version 7 BreakInheritanceListStyle BreakInheritanceList; + /// The template declaration breaking style to use. 
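  // NOTE (annotation, not part of the patch): after this change both YAML
  // keys parse into the same style field (still named
  // AlwaysBreakTemplateDeclarations here; the follow-up NFC commit below
  // renames the member itself, and only the old key is emitted for input
  // compatibility). A hedged .clang-format sketch:
  //
  //   # .clang-format
  //   BreakTemplateDeclarations: Yes          # new name, clang-format 19
  //   # AlwaysBreakTemplateDeclarations: Yes  # deprecated alias, still parsed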
+ /// \version 19 + // BreakTemplateDeclarationsStyle BreakTemplateDeclarations; + /// If ``true``, consecutive namespace declarations will be on the same /// line. If ``false``, each namespace is declared on a new line. /// \code diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index c5714af..c5a8949 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -877,6 +877,8 @@ template <> struct MappingTraits { if (!IO.outputting()) { IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines); IO.mapOptional("AllowAllConstructorInitializersOnNextLine", OnNextLine); + IO.mapOptional("AlwaysBreakTemplateDeclarations", + Style.AlwaysBreakTemplateDeclarations); IO.mapOptional("BreakBeforeInheritanceComma", BreakBeforeInheritanceComma); IO.mapOptional("BreakConstructorInitializersBeforeComma", @@ -943,8 +945,6 @@ template <> struct MappingTraits { Style.AlwaysBreakAfterReturnType); IO.mapOptional("AlwaysBreakBeforeMultilineStrings", Style.AlwaysBreakBeforeMultilineStrings); - IO.mapOptional("AlwaysBreakTemplateDeclarations", - Style.AlwaysBreakTemplateDeclarations); IO.mapOptional("AttributeMacros", Style.AttributeMacros); IO.mapOptional("BinPackArguments", Style.BinPackArguments); IO.mapOptional("BinPackParameters", Style.BinPackParameters); @@ -971,6 +971,8 @@ template <> struct MappingTraits { Style.BreakConstructorInitializers); IO.mapOptional("BreakInheritanceList", Style.BreakInheritanceList); IO.mapOptional("BreakStringLiterals", Style.BreakStringLiterals); + IO.mapOptional("BreakTemplateDeclarations", + Style.AlwaysBreakTemplateDeclarations); IO.mapOptional("ColumnLimit", Style.ColumnLimit); IO.mapOptional("CommentPragmas", Style.CommentPragmas); IO.mapOptional("CompactNamespaces", Style.CompactNamespaces); diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 7493b0a..22681a2 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -695,6 +695,19 @@ TEST(ConfigParseTest, ParsesConfiguration) { FormatStyle::RTBS_TopLevelDefinitions); Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + CHECK_PARSE("BreakTemplateDeclarations: Leave", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); + CHECK_PARSE("BreakTemplateDeclarations: No", AlwaysBreakTemplateDeclarations, + FormatStyle::BTDS_No); + CHECK_PARSE("BreakTemplateDeclarations: MultiLine", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: Yes", AlwaysBreakTemplateDeclarations, + FormatStyle::BTDS_Yes); + CHECK_PARSE("BreakTemplateDeclarations: false", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: true", + AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); + // For backward compatibility: CHECK_PARSE("AlwaysBreakTemplateDeclarations: Leave", AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); CHECK_PARSE("AlwaysBreakTemplateDeclarations: No", -- cgit v1.1 From 7664ddf8811242295abb837640cad8dd8cefb5e8 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 9 Feb 2024 20:15:35 -0800 Subject: [clang-format][NFC] Drop "Always" in "AlwaysBreakTemplateDeclarations" --- clang/include/clang/Format/Format.h | 7 +++--- clang/lib/Format/ContinuationIndenter.cpp | 5 ++-- clang/lib/Format/Format.cpp | 10 ++++---- clang/lib/Format/TokenAnnotator.cpp | 6 ++--- clang/unittests/Format/ConfigParseTest.cpp | 38 +++++++++++++++--------------- 
clang/unittests/Format/FormatTest.cpp | 11 ++++----- 6 files changed, 37 insertions(+), 40 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index b4969aa..ab56cc8 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -1078,7 +1078,7 @@ struct FormatStyle { /// This option is renamed to ``BreakTemplateDeclarations``. /// \version 3.4 /// @deprecated - BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations; + // BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations; /// A vector of strings that should be interpreted as attributes/qualifiers /// instead of identifiers. This can be useful for language extensions or @@ -2296,7 +2296,7 @@ struct FormatStyle { /// The template declaration breaking style to use. /// \version 19 - // BreakTemplateDeclarationsStyle BreakTemplateDeclarations; + BreakTemplateDeclarationsStyle BreakTemplateDeclarations; /// If ``true``, consecutive namespace declarations will be on the same /// line. If ``false``, each namespace is declared on a new line. @@ -4822,8 +4822,7 @@ struct FormatStyle { AlwaysBreakAfterReturnType == R.AlwaysBreakAfterReturnType && AlwaysBreakBeforeMultilineStrings == R.AlwaysBreakBeforeMultilineStrings && - AlwaysBreakTemplateDeclarations == - R.AlwaysBreakTemplateDeclarations && + BreakTemplateDeclarations == R.BreakTemplateDeclarations && AttributeMacros == R.AttributeMacros && BinPackArguments == R.BinPackArguments && BinPackParameters == R.BinPackParameters && diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 7fd04b2..0b2ef97 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -569,9 +569,8 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { return true; } } - return Style.AlwaysBreakTemplateDeclarations != FormatStyle::BTDS_No && - (Style.AlwaysBreakTemplateDeclarations != - FormatStyle::BTDS_Leave || + return Style.BreakTemplateDeclarations != FormatStyle::BTDS_No && + (Style.BreakTemplateDeclarations != FormatStyle::BTDS_Leave || Current.NewlinesBefore > 0); } if (Previous.is(TT_FunctionAnnotationRParen) && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index c5a8949..d2cc466 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -878,7 +878,7 @@ template <> struct MappingTraits { IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines); IO.mapOptional("AllowAllConstructorInitializersOnNextLine", OnNextLine); IO.mapOptional("AlwaysBreakTemplateDeclarations", - Style.AlwaysBreakTemplateDeclarations); + Style.BreakTemplateDeclarations); IO.mapOptional("BreakBeforeInheritanceComma", BreakBeforeInheritanceComma); IO.mapOptional("BreakConstructorInitializersBeforeComma", @@ -972,7 +972,7 @@ template <> struct MappingTraits { IO.mapOptional("BreakInheritanceList", Style.BreakInheritanceList); IO.mapOptional("BreakStringLiterals", Style.BreakStringLiterals); IO.mapOptional("BreakTemplateDeclarations", - Style.AlwaysBreakTemplateDeclarations); + Style.BreakTemplateDeclarations); IO.mapOptional("ColumnLimit", Style.ColumnLimit); IO.mapOptional("CommentPragmas", Style.CommentPragmas); IO.mapOptional("CompactNamespaces", Style.CompactNamespaces); @@ -1441,7 +1441,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.AlwaysBreakAfterReturnType = FormatStyle::RTBS_None; LLVMStyle.AlwaysBreakAfterDefinitionReturnType = 
FormatStyle::DRTBS_None; LLVMStyle.AlwaysBreakBeforeMultilineStrings = false; - LLVMStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_MultiLine; + LLVMStyle.BreakTemplateDeclarations = FormatStyle::BTDS_MultiLine; LLVMStyle.AttributeMacros.push_back("__capability"); LLVMStyle.BitFieldColonSpacing = FormatStyle::BFCS_Both; LLVMStyle.BinPackArguments = true; @@ -1631,7 +1631,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { FormatStyle::SIS_WithoutElse; GoogleStyle.AllowShortLoopsOnASingleLine = true; GoogleStyle.AlwaysBreakBeforeMultilineStrings = true; - GoogleStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + GoogleStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; GoogleStyle.DerivePointerAlignment = true; GoogleStyle.IncludeStyle.IncludeCategories = {{"^", 2, 0, false}, {"^<.*\\.h>", 1, 0, false}, @@ -1824,7 +1824,7 @@ FormatStyle getMozillaStyle() { MozillaStyle.AlwaysBreakAfterReturnType = FormatStyle::RTBS_TopLevel; MozillaStyle.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_TopLevel; - MozillaStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + MozillaStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; MozillaStyle.BinPackParameters = false; MozillaStyle.BinPackArguments = false; MozillaStyle.BreakBeforeBraces = FormatStyle::BS_Mozilla; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index cec56fa..b103400 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5184,8 +5184,8 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, // concept ... if (Right.is(tok::kw_concept)) return Style.BreakBeforeConceptDeclarations == FormatStyle::BBCDS_Always; - return Style.AlwaysBreakTemplateDeclarations == FormatStyle::BTDS_Yes || - (Style.AlwaysBreakTemplateDeclarations == FormatStyle::BTDS_Leave && + return Style.BreakTemplateDeclarations == FormatStyle::BTDS_Yes || + (Style.BreakTemplateDeclarations == FormatStyle::BTDS_Leave && Right.NewlinesBefore > 0); } if (Left.ClosesRequiresClause && Right.isNot(tok::semi)) { @@ -5620,7 +5620,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, if (Right.is(TT_RequiresClause)) return true; if (Left.ClosesTemplateDeclaration) { - return Style.AlwaysBreakTemplateDeclarations != FormatStyle::BTDS_Leave || + return Style.BreakTemplateDeclarations != FormatStyle::BTDS_Leave || Right.NewlinesBefore > 0; } if (Left.is(TT_FunctionAnnotationRParen)) diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 22681a2..571e1eb 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -694,32 +694,32 @@ TEST(ConfigParseTest, ParsesConfiguration) { AlwaysBreakAfterReturnType, FormatStyle::RTBS_TopLevelDefinitions); - Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; - CHECK_PARSE("BreakTemplateDeclarations: Leave", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); - CHECK_PARSE("BreakTemplateDeclarations: No", AlwaysBreakTemplateDeclarations, + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; + CHECK_PARSE("BreakTemplateDeclarations: Leave", BreakTemplateDeclarations, + FormatStyle::BTDS_Leave); + CHECK_PARSE("BreakTemplateDeclarations: No", BreakTemplateDeclarations, FormatStyle::BTDS_No); - CHECK_PARSE("BreakTemplateDeclarations: MultiLine", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); - CHECK_PARSE("BreakTemplateDeclarations: Yes", 
AlwaysBreakTemplateDeclarations, + CHECK_PARSE("BreakTemplateDeclarations: MultiLine", BreakTemplateDeclarations, + FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: Yes", BreakTemplateDeclarations, + FormatStyle::BTDS_Yes); + CHECK_PARSE("BreakTemplateDeclarations: false", BreakTemplateDeclarations, + FormatStyle::BTDS_MultiLine); + CHECK_PARSE("BreakTemplateDeclarations: true", BreakTemplateDeclarations, FormatStyle::BTDS_Yes); - CHECK_PARSE("BreakTemplateDeclarations: false", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); - CHECK_PARSE("BreakTemplateDeclarations: true", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); // For backward compatibility: CHECK_PARSE("AlwaysBreakTemplateDeclarations: Leave", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave); - CHECK_PARSE("AlwaysBreakTemplateDeclarations: No", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_No); + BreakTemplateDeclarations, FormatStyle::BTDS_Leave); + CHECK_PARSE("AlwaysBreakTemplateDeclarations: No", BreakTemplateDeclarations, + FormatStyle::BTDS_No); CHECK_PARSE("AlwaysBreakTemplateDeclarations: MultiLine", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); - CHECK_PARSE("AlwaysBreakTemplateDeclarations: Yes", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); + BreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + CHECK_PARSE("AlwaysBreakTemplateDeclarations: Yes", BreakTemplateDeclarations, + FormatStyle::BTDS_Yes); CHECK_PARSE("AlwaysBreakTemplateDeclarations: false", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); + BreakTemplateDeclarations, FormatStyle::BTDS_MultiLine); CHECK_PARSE("AlwaysBreakTemplateDeclarations: true", - AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes); + BreakTemplateDeclarations, FormatStyle::BTDS_Yes); Style.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_All; CHECK_PARSE("AlwaysBreakAfterDefinitionReturnType: None", diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index b1a2247..7b65c8d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -10638,7 +10638,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) { " const typename aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaa);"); FormatStyle AlwaysBreak = getLLVMStyle(); - AlwaysBreak.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + AlwaysBreak.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; verifyFormat("template \nclass C {};", AlwaysBreak); verifyFormat("template \nvoid f();", AlwaysBreak); verifyFormat("template \nvoid f() {}", AlwaysBreak); @@ -10667,7 +10667,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) { "};"); FormatStyle NeverBreak = getLLVMStyle(); - NeverBreak.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_No; + NeverBreak.BreakTemplateDeclarations = FormatStyle::BTDS_No; verifyFormat("template class C {};", NeverBreak); verifyFormat("template void f();", NeverBreak); verifyFormat("template void f() {}", NeverBreak); @@ -10699,7 +10699,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) { NeverBreak); auto Style = getLLVMStyle(); - Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Leave; + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Leave; verifyNoChange("template \n" "class C {};", @@ -11297,7 +11297,7 @@ TEST_F(FormatTest, UnderstandsFunctionRefQualification) { verifyFormat("SomeType MemberFunction( const Deleted & ) &;", Spaces); FormatStyle BreakTemplate = getLLVMStyle(); - 
BreakTemplate.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes; + BreakTemplate.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; verifyFormat("struct f {\n" " template \n" @@ -11330,8 +11330,7 @@ TEST_F(FormatTest, UnderstandsFunctionRefQualification) { BreakTemplate); FormatStyle AlignLeftBreakTemplate = getLLVMStyle(); - AlignLeftBreakTemplate.AlwaysBreakTemplateDeclarations = - FormatStyle::BTDS_Yes; + AlignLeftBreakTemplate.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; AlignLeftBreakTemplate.PointerAlignment = FormatStyle::PAS_Left; verifyFormat("struct f {\n" -- cgit v1.1 From e165bea1d4ec2de96ee0548cece79d71a75ce8f8 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 9 Feb 2024 20:57:05 -0800 Subject: [lld] Fix test failures when running as root user (#81339) This makes it easier to run the tests in a containerized environment. --- lld/test/COFF/lto-cache-errors.ll | 2 +- lld/test/COFF/thinlto-emit-imports.ll | 2 +- lld/test/ELF/lto/resolution-err.ll | 2 +- lld/test/ELF/lto/thinlto-cant-write-index.ll | 2 +- lld/test/ELF/lto/thinlto-emit-imports.ll | 2 +- lld/test/MachO/invalid/invalid-lto-object-path.ll | 2 +- lld/test/MachO/thinlto-emit-imports.ll | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lld/test/COFF/lto-cache-errors.ll b/lld/test/COFF/lto-cache-errors.ll index 55244e5..a46190a 100644 --- a/lld/test/COFF/lto-cache-errors.ll +++ b/lld/test/COFF/lto-cache-errors.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Not supported on windows since we use permissions to deny the creation ; UNSUPPORTED: system-windows diff --git a/lld/test/COFF/thinlto-emit-imports.ll b/lld/test/COFF/thinlto-emit-imports.ll index a9f22c1..b47a6cea 100644 --- a/lld/test/COFF/thinlto-emit-imports.ll +++ b/lld/test/COFF/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Generate summary sections and test lld handling. ; RUN: opt -module-summary %s -o %t1.obj diff --git a/lld/test/ELF/lto/resolution-err.ll b/lld/test/ELF/lto/resolution-err.ll index 6dfa64b..f9855ab 100644 --- a/lld/test/ELF/lto/resolution-err.ll +++ b/lld/test/ELF/lto/resolution-err.ll @@ -1,5 +1,5 @@ ; UNSUPPORTED: system-windows -; REQUIRES: shell +; REQUIRES: shell, non-root-user ; RUN: llvm-as %s -o %t.bc ; RUN: touch %t.resolution.txt ; RUN: chmod u-w %t.resolution.txt diff --git a/lld/test/ELF/lto/thinlto-cant-write-index.ll b/lld/test/ELF/lto/thinlto-cant-write-index.ll index e664acb..286fcdd 100644 --- a/lld/test/ELF/lto/thinlto-cant-write-index.ll +++ b/lld/test/ELF/lto/thinlto-cant-write-index.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Basic ThinLTO tests. 
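; NOTE (annotation, not part of the patch): every test touched by this commit
; revokes a permission (for example `chmod u-w` on an output file or
; directory) and expects the subsequent write to fail. Root bypasses POSIX
; permission checks, so the failure never happens under uid 0; the new
; `non-root-user` REQUIRES keyword skips these tests in that environment.
; Illustrative shape only, not the literal RUN lines of this test:
;   RUN: chmod u-w %t.index.bc
;   RUN: not ld.lld ... 2>&1 | FileCheck %s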
; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/ELF/lto/thinlto-emit-imports.ll b/lld/test/ELF/lto/thinlto-emit-imports.ll index 6d0e1e6..253ec08 100644 --- a/lld/test/ELF/lto/thinlto-emit-imports.ll +++ b/lld/test/ELF/lto/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Test a few properties not tested by thinlto-index-only.ll ; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/MachO/invalid/invalid-lto-object-path.ll b/lld/test/MachO/invalid/invalid-lto-object-path.ll index 75c6a97..c862538 100644 --- a/lld/test/MachO/invalid/invalid-lto-object-path.ll +++ b/lld/test/MachO/invalid/invalid-lto-object-path.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Creating read-only directories with `chmod 400` isn't supported on Windows ; UNSUPPORTED: system-windows diff --git a/lld/test/MachO/thinlto-emit-imports.ll b/lld/test/MachO/thinlto-emit-imports.ll index 47a612b..88f766f 100644 --- a/lld/test/MachO/thinlto-emit-imports.ll +++ b/lld/test/MachO/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; RUN: rm -rf %t; split-file %s %t ; Generate summary sections and test lld handling. -- cgit v1.1 From 7192c22ee43500b1a6313d1ade38e002463944a6 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Sat, 10 Feb 2024 00:42:33 -0500 Subject: [GlobalISel][RISCV] Use constant pool for large integer constants. (#81101) We apply custom lowering to 64 bit constants where we use the same logic as in non-global isel: if materializing in registers is too expensive, we emit a load from constant pool. Later, during instruction selection, constant pool address is generated using `selectAddr`. --- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 89 +++++++++++++++++++++- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h | 4 + .../legalizer/legalize-bitreverse-rv64.mir | 33 ++++---- .../GlobalISel/legalizer/legalize-const-rv64.mir | 36 +++++++-- 4 files changed, 138 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index ae02e86..e852052 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -11,11 +11,14 @@ //===----------------------------------------------------------------------===// #include "RISCVLegalizerInfo.h" +#include "MCTargetDesc/RISCVMatInt.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" @@ -182,7 +185,13 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) CTPOPActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0).lower(); } - getActionDefinitionsBuilder({G_CONSTANT, G_IMPLICIT_DEF}) + auto &ConstantActions = getActionDefinitionsBuilder(G_CONSTANT); + ConstantActions.legalFor({s32, p0}); + if (ST.is64Bit()) + ConstantActions.customFor({s64}); + ConstantActions.widenScalarToNextPow2(0).clampScalar(0, s32, sXLen); + + getActionDefinitionsBuilder(G_IMPLICIT_DEF) .legalFor({s32, sXLen, p0}) .widenScalarToNextPow2(0) .clampScalar(0, s32, sXLen); @@ -451,17 +460,95 @@ bool 
RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI, return true; } +bool RISCVLegalizerInfo::shouldBeInConstantPool(APInt APImm, + bool ShouldOptForSize) const { + unsigned BitWidth = APImm.getBitWidth(); + assert(BitWidth == 32 || BitWidth == 64); + int64_t Imm = APImm.getSExtValue(); + // All simm32 constants should be handled by isel. + // NOTE: The getMaxBuildIntsCost call below should return a value >= 2 making + // this check redundant, but small immediates are common so this check + // should have better compile time. + if (isInt<32>(Imm)) + return false; + + // We only need to cost the immediate, if constant pool lowering is enabled. + if (!STI.useConstantPoolForLargeInts()) + return false; + + RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Imm, STI); + if (Seq.size() <= STI.getMaxBuildIntsCost()) + return false; + + // Optimizations below are disabled for opt size. If we're optimizing for + // size, use a constant pool. + if (ShouldOptForSize) + return true; + // + // Special case. See if we can build the constant as (ADD (SLLI X, C), X) do + // that if it will avoid a constant pool. + // It will require an extra temporary register though. + // If we have Zba we can use (ADD_UW X, (SLLI X, 32)) to handle cases where + // low and high 32 bits are the same and bit 31 and 63 are set. + unsigned ShiftAmt, AddOpc; + RISCVMatInt::InstSeq SeqLo = + RISCVMatInt::generateTwoRegInstSeq(Imm, STI, ShiftAmt, AddOpc); + return !(!SeqLo.empty() && (SeqLo.size() + 2) <= STI.getMaxBuildIntsCost()); +} + +// TODO: This is almost the same as LegalizerHelper::lowerFConstant and is +// target-independent. Should we move this to LegalizeHelper? +bool RISCVLegalizerInfo::emitLoadFromConstantPool( + Register DstReg, const Constant *ConstVal, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + MachineFunction &MF = MIRBuilder.getMF(); + const DataLayout &DL = MIRBuilder.getDataLayout(); + LLVMContext &Ctx = MF.getFunction().getContext(); + unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace(); + LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); + LLT DstLLT = MRI.getType(DstReg); + + Align Alignment(DL.getABITypeAlign(getTypeForLLT(DstLLT, Ctx))); + + auto Addr = MIRBuilder.buildConstantPool( + AddrPtrTy, + MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment)); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, DstLLT, Alignment); + + MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, DstReg, Addr, *MMO); + return true; +} + bool RISCVLegalizerInfo::legalizeCustom( LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; GISelChangeObserver &Observer = Helper.Observer; + MachineFunction &MF = *MI.getParent()->getParent(); switch (MI.getOpcode()) { default: // No idea what to do. return false; case TargetOpcode::G_ABS: return Helper.lowerAbsToMaxNeg(MI); + // TODO: G_FCONSTANT + case TargetOpcode::G_CONSTANT: { + const Function &F = MF.getFunction(); + // TODO: if PSI and BFI are present, add " || + // llvm::shouldOptForSize(*CurMBB, PSI, BFI)". 
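  // NOTE (annotation, not part of the patch): condensed, the G_CONSTANT
  // lowering below is a cost comparison. The G_CONSTANT is kept whenever isel
  // can materialize the immediate cheaply; otherwise it becomes a load:
  //
  //   if (shouldBeInConstantPool(ConstVal->getValue(), ShouldOptForSize))
  //     emitLoadFromConstantPool(...);  // G_CONSTANT_POOL + G_LOAD
  //   else
  //     /* keep G_CONSTANT: the value is a simm32, or its            */
  //     /* RISCVMatInt::generateInstSeq cost is within               */
  //     /* STI.getMaxBuildIntsCost()                                 */;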
+ bool ShouldOptForSize = F.hasOptSize() || F.hasMinSize(); + const ConstantInt *ConstVal = MI.getOperand(1).getCImm(); + if (!shouldBeInConstantPool(ConstVal->getValue(), ShouldOptForSize)) + return true; + emitLoadFromConstantPool(MI.getOperand(0).getReg(), + MI.getOperand(1).getCImm(), MIRBuilder); + MI.eraseFromParent(); + return true; + } case TargetOpcode::G_SHL: case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h index f3ec6be..046555f 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/Register.h" namespace llvm { @@ -36,6 +37,9 @@ public: MachineInstr &MI) const override; private: + bool shouldBeInConstantPool(APInt APImm, bool ShouldOptForSize) const; + bool emitLoadFromConstantPool(Register DstReg, const Constant *CPVal, + MachineIRBuilder &MIRBuilder) const; bool legalizeShlAshrLshr(MachineInstr &MI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir index f4a098d..d147350 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir @@ -220,25 +220,28 @@ body: | ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[LSHR3]], [[C5]] ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[OR5]], [[AND5]] ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1085102592571150096 - ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[OR6]], [[C8]] + ; CHECK-NEXT: [[CONSTANT_POOL:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.2 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[OR6]], [[LOAD]] ; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s64) = G_LSHR [[AND6]], [[C7]](s64) ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s64) = G_SHL [[OR6]], [[C7]](s64) - ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SHL4]], [[C8]] + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SHL4]], [[LOAD]] ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s64) = G_OR [[LSHR4]], [[AND7]] - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 -3689348814741910324 - ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[OR7]], [[C10]] - ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s64) = G_LSHR [[AND8]], [[C9]](s64) - ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C9]](s64) - ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SHL5]], [[C10]] + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[CONSTANT_POOL1:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL1]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[OR7]], [[LOAD1]] + ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s64) = G_LSHR [[AND8]], [[C8]](s64) + ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C8]](s64) + ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SHL5]], [[LOAD1]] ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s64) = G_OR [[LSHR5]], [[AND9]] - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; 
CHECK-NEXT: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 -6148914691236517206 - ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[OR8]], [[C12]] - ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND10]], [[C11]](s64) - ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C11]](s64) - ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s64) = G_AND [[SHL6]], [[C12]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[CONSTANT_POOL2:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.0 + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL2]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[OR8]], [[LOAD2]] + ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND10]], [[C9]](s64) + ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C9]](s64) + ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s64) = G_AND [[SHL6]], [[LOAD2]] ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s64) = G_OR [[LSHR6]], [[AND11]] ; CHECK-NEXT: $x10 = COPY [[OR9]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir index fa57295..6389fd6 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir @@ -6,8 +6,9 @@ name: const_i8 body: | bb.0.entry: ; CHECK-LABEL: name: const_i8 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -127 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -127 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s8) = G_CONSTANT i8 129 %1:_(s64) = G_ANYEXT %0(s8) @@ -20,8 +21,9 @@ name: const_i15 body: | bb.0.entry: ; CHECK-LABEL: name: const_i15 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s15) = G_CONSTANT i15 15 %1:_(s64) = G_ANYEXT %0(s15) @@ -34,8 +36,9 @@ name: const_i16 body: | bb.0.entry: ; CHECK-LABEL: name: const_i16 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 767 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 767 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s16) = G_CONSTANT i16 -64769 %1:_(s64) = G_ANYEXT %0(s16) @@ -48,8 +51,9 @@ name: const_i32 body: | bb.0.entry: ; CHECK-LABEL: name: const_i32 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -64769 - ; CHECK-NEXT: $x10 = COPY [[C]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -64769 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s32) = G_CONSTANT i32 -64769 %1:_(s64) = G_ANYEXT %0(s32) @@ -180,3 +184,19 @@ body: | PseudoRET implicit $x10 ... + +... 
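# NOTE (annotation, not part of the patch): the new MIR test below exercises
# the constant-pool path directly. -1085102592571150096 is 0xF0F0F0F0F0F0F0F0,
# the nibble-swap mask also used by the bitreverse test above; it is not a
# simm32 and is expensive to build in registers, so legalization is expected
# to emit G_CONSTANT_POOL plus a G_LOAD instead of keeping the G_CONSTANT.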
+--- +name: constant_pool_i64 +body: | + bb.0.entry: + ; CHECK-LABEL: name: constant_pool_i64 + ; CHECK: [[CONSTANT_POOL:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL]](p0) :: (load (s64) from constant-pool) + ; CHECK-NEXT: $x10 = COPY [[LOAD]](s64) + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:_(s64) = G_CONSTANT i64 -1085102592571150096 + $x10 = COPY %0(s64) + PseudoRET implicit $x10 + +... -- cgit v1.1 From c08b90c50bcac9f3f563c79491c8dbcbe7c3b574 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Feb 2024 21:34:10 -0800 Subject: [RISCV] Lower the TransientStackAlignment to the ABI alignment for rv32e/rv64e. I don't think the transient alignment needs to be larger than the ABI alignment. --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 8 +- llvm/test/CodeGen/RISCV/callee-saved-gprs.ll | 80 ++-- llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll | 208 +++++----- llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll | 226 +++++------ llvm/test/CodeGen/RISCV/vararg.ll | 470 +++++++++++------------ 5 files changed, 496 insertions(+), 496 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 60f92af..0de4785 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -36,10 +36,10 @@ static Align getABIStackAlignment(RISCVABI::ABI ABI) { } RISCVFrameLowering::RISCVFrameLowering(const RISCVSubtarget &STI) - : TargetFrameLowering(StackGrowsDown, - getABIStackAlignment(STI.getTargetABI()), - /*LocalAreaOffset=*/0, - /*TransientStackAlignment=*/Align(16)), + : TargetFrameLowering( + StackGrowsDown, getABIStackAlignment(STI.getTargetABI()), + /*LocalAreaOffset=*/0, + /*TransientStackAlignment=*/getABIStackAlignment(STI.getTargetABI())), STI(STI) {} static const MCPhysReg AllPopRegs[] = { diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index 710b602..5e8ed45 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -150,24 +150,24 @@ define void @callee() nounwind { ; ; RV32I-ILP32E-LABEL: callee: ; RV32I-ILP32E: # %bb.0: -; RV32I-ILP32E-NEXT: addi sp, sp, -48 -; RV32I-ILP32E-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: addi sp, sp, -36 +; RV32I-ILP32E-NEXT: sw ra, 32(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw s1, 24(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lui a6, %hi(var) ; RV32I-ILP32E-NEXT: lw a0, %lo(var)(a6) -; RV32I-ILP32E-NEXT: sw a0, 32(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw a0, %lo(var+4)(a6) -; RV32I-ILP32E-NEXT: sw a0, 28(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 16(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw a0, %lo(var+8)(a6) -; RV32I-ILP32E-NEXT: sw a0, 24(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw a0, %lo(var+12)(a6) -; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: addi a5, a6, %lo(var) ; RV32I-ILP32E-NEXT: lw a0, 16(a5) -; RV32I-ILP32E-NEXT: sw a0, 16(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 4(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: 
lw a0, 20(a5) -; RV32I-ILP32E-NEXT: sw a0, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32E-NEXT: sw a0, 0(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw t0, 24(a5) ; RV32I-ILP32E-NEXT: lw t1, 28(a5) ; RV32I-ILP32E-NEXT: lw t2, 32(a5) @@ -220,22 +220,22 @@ define void @callee() nounwind { ; RV32I-ILP32E-NEXT: sw t2, 32(a5) ; RV32I-ILP32E-NEXT: sw t1, 28(a5) ; RV32I-ILP32E-NEXT: sw t0, 24(a5) -; RV32I-ILP32E-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 0(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, 20(a5) -; RV32I-ILP32E-NEXT: lw a0, 16(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 4(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, 16(a5) -; RV32I-ILP32E-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var+12)(a6) -; RV32I-ILP32E-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var+8)(a6) -; RV32I-ILP32E-NEXT: lw a0, 28(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var+4)(a6) -; RV32I-ILP32E-NEXT: lw a0, 32(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw a0, 20(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, %lo(var)(a6) -; RV32I-ILP32E-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: lw s1, 36(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: addi sp, sp, 48 +; RV32I-ILP32E-NEXT: lw ra, 32(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32I-ILP32E-NEXT: addi sp, sp, 36 ; RV32I-ILP32E-NEXT: ret ; ; RV32I-WITH-FP-LABEL: callee: @@ -659,24 +659,24 @@ define void @callee() nounwind { ; ; RV64I-LP64E-LABEL: callee: ; RV64I-LP64E: # %bb.0: -; RV64I-LP64E-NEXT: addi sp, sp, -80 -; RV64I-LP64E-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: sd s1, 56(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: addi sp, sp, -72 +; RV64I-LP64E-NEXT: sd ra, 64(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: sd s0, 56(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: sd s1, 48(sp) # 8-byte Folded Spill ; RV64I-LP64E-NEXT: lui a6, %hi(var) ; RV64I-LP64E-NEXT: lw a0, %lo(var)(a6) -; RV64I-LP64E-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a6) ; RV64I-LP64E-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a6) +; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a6) ; RV64I-LP64E-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a6) ; RV64I-LP64E-NEXT: sd a0, 24(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-LP64E-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-LP64E-NEXT: addi a5, a6, %lo(var) ; RV64I-LP64E-NEXT: lw a0, 16(a5) -; RV64I-LP64E-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, 20(a5) ; RV64I-LP64E-NEXT: sd a0, 8(sp) # 8-byte Folded Spill +; RV64I-LP64E-NEXT: lw a0, 20(a5) +; RV64I-LP64E-NEXT: sd a0, 0(sp) # 8-byte Folded Spill ; RV64I-LP64E-NEXT: lw t0, 24(a5) ; RV64I-LP64E-NEXT: lw t1, 28(a5) ; RV64I-LP64E-NEXT: lw t2, 32(a5) @@ -729,22 +729,22 @@ define void @callee() nounwind { ; RV64I-LP64E-NEXT: sw t2, 32(a5) ; RV64I-LP64E-NEXT: sw t1, 28(a5) ; RV64I-LP64E-NEXT: sw t0, 24(a5) -; 
RV64I-LP64E-NEXT: ld a0, 8(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 0(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, 20(a5) -; RV64I-LP64E-NEXT: ld a0, 16(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, 16(a5) -; RV64I-LP64E-NEXT: ld a0, 24(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var+12)(a6) -; RV64I-LP64E-NEXT: ld a0, 32(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 24(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var+8)(a6) -; RV64I-LP64E-NEXT: ld a0, 40(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var+4)(a6) -; RV64I-LP64E-NEXT: ld a0, 48(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld a0, 40(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, %lo(var)(a6) -; RV64I-LP64E-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: ld s1, 56(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: addi sp, sp, 80 +; RV64I-LP64E-NEXT: ld ra, 64(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld s0, 56(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: ld s1, 48(sp) # 8-byte Folded Reload +; RV64I-LP64E-NEXT: addi sp, sp, 72 ; RV64I-LP64E-NEXT: ret ; ; RV64I-WITH-FP-LABEL: callee: diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll index 0eb6391..5c55113 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll @@ -151,20 +151,20 @@ define i32 @callee_float_on_stack(i64 %a, i64 %b, i64 %c, i64 %d, float %e) { ; ; ILP32E-WITHFP-LABEL: callee_float_on_stack: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a0, 8(s0) ; ILP32E-WITHFP-NEXT: lw a1, 0(s0) ; ILP32E-WITHFP-NEXT: add a0, a1, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_float_on_stack: @@ -298,18 +298,18 @@ define float @callee_tiny_scalar_ret() { ; ; ILP32E-WITHFP-LABEL: callee_tiny_scalar_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, 
-8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lui a0, 260096 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_tiny_scalar_ret: @@ -543,13 +543,13 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ; ILP32E-WITHFP-LABEL: callee_aligned_stack: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a0, 0(a2) ; ILP32E-WITHFP-NEXT: lw a1, 12(s0) @@ -562,9 +562,9 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ILP32E-WITHFP-NEXT: add a0, a0, a1 ; ILP32E-WITHFP-NEXT: add a4, a5, a4 ; ILP32E-WITHFP-NEXT: add a0, a0, a4 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_aligned_stack: @@ -847,19 +847,19 @@ define double @callee_small_scalar_ret() { ; ; ILP32E-WITHFP-LABEL: callee_small_scalar_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lui a1, 261888 ; ILP32E-WITHFP-NEXT: li a0, 0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_scalar_ret: @@ -944,18 +944,18 @@ define i32 @callee_i64_in_regs(i32 %a, i64 %b) { ; ; ILP32E-WITHFP-LABEL: callee_i64_in_regs: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 
8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: add a0, a0, a1 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_i64_in_regs: @@ -1066,13 +1066,13 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ; ILP32E-WITHFP-LABEL: callee_many_scalars: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a6, 12(s0) ; ILP32E-WITHFP-NEXT: lw a7, 0(s0) @@ -1091,9 +1091,9 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ILP32E-WITHFP-NEXT: add a0, a0, a7 ; ILP32E-WITHFP-NEXT: add a0, a0, a6 ; ILP32E-WITHFP-NEXT: add a0, a1, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_many_scalars: @@ -1287,13 +1287,13 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ; ILP32E-WITHFP-LABEL: callee_large_scalars: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a2, 0(a1) ; ILP32E-WITHFP-NEXT: lw a3, 0(a0) @@ -1311,9 +1311,9 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-WITHFP-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-NEXT: or a0, a0, a4 ; ILP32E-WITHFP-NEXT: seqz a0, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; 
ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars: @@ -1514,13 +1514,13 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ; ILP32E-WITHFP-LABEL: callee_large_scalars_exhausted_regs: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a0, 12(s0) ; ILP32E-WITHFP-NEXT: lw a1, 4(s0) @@ -1540,9 +1540,9 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-WITHFP-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-NEXT: or a0, a0, a4 ; ILP32E-WITHFP-NEXT: seqz a0, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars_exhausted_regs: @@ -1872,19 +1872,19 @@ define i32 @callee_small_coerced_struct([2 x i32] %a.coerce) { ; ; ILP32E-WITHFP-LABEL: callee_small_coerced_struct: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: xor a0, a0, a1 ; ILP32E-WITHFP-NEXT: seqz a0, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_coerced_struct: @@ -1983,20 +1983,20 @@ define i32 @callee_large_struct(ptr byval(%struct.large) align 4 %a) { ; ; ILP32E-WITHFP-LABEL: callee_large_struct: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: 
.cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a1, 0(a0) ; ILP32E-WITHFP-NEXT: lw a0, 12(a0) ; ILP32E-WITHFP-NEXT: add a0, a1, a0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_struct: @@ -2153,19 +2153,19 @@ define %struct.small @callee_small_struct_ret() { ; ; ILP32E-WITHFP-LABEL: callee_small_struct_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 0 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_struct_ret: @@ -2260,22 +2260,22 @@ define fp128 @callee_large_scalar_ret() { ; ; ILP32E-WITHFP-LABEL: callee_large_scalar_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lui a1, 524272 ; ILP32E-WITHFP-NEXT: sw a1, 12(a0) ; ILP32E-WITHFP-NEXT: sw zero, 8(a0) ; ILP32E-WITHFP-NEXT: sw zero, 4(a0) ; ILP32E-WITHFP-NEXT: sw zero, 0(a0) -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalar_ret: @@ -2395,13 +2395,13 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result ; ; ILP32E-WITHFP-LABEL: callee_large_struct_ret: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -16 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 
8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -8 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8 +; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: li a1, 1 ; ILP32E-WITHFP-NEXT: sw a1, 0(a0) @@ -2411,9 +2411,9 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result ; ILP32E-WITHFP-NEXT: sw a1, 8(a0) ; ILP32E-WITHFP-NEXT: li a1, 4 ; ILP32E-WITHFP-NEXT: sw a1, 12(a0) -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 16 +; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 8 ; ILP32E-WITHFP-NEXT: ret ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_struct_ret: diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll index 2fb674f..7fe67a0 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll @@ -76,28 +76,28 @@ define i32 @va1(ptr %fmt, ...) { ; ; LP64E-FPELIM-LABEL: va1: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 28 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 20 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; LP64E-WITHFP-NEXT: .cfi_offset ra, -56 ; LP64E-WITHFP-NEXT: .cfi_offset s0, -64 -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: .cfi_def_cfa s0, 48 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) @@ -107,9 +107,9 @@ define i32 @va1(ptr %fmt, ...) 
{ ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 12 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -161,24 +161,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va1_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -187,9 +187,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -435,24 +435,24 @@ define i64 @va2(ptr %fmt, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va2: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 39 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 31 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -461,9 +461,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 23 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -521,24 +521,24 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va2_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -547,9 +547,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -654,23 +654,23 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va3: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 31 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 23 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -678,9 +678,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 15 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -737,23 +737,23 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va3_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 24 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 16 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -761,9 +761,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 8 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1208,24 +1208,24 @@ define i32 @va6_no_fixed_args(...) nounwind { ; ; LP64E-FPELIM-LABEL: va6_no_fixed_args: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: sd a0, 16(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 24 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: sd a0, 8(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 16 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va6_no_fixed_args: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) @@ -1234,9 +1234,9 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a0, 0(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 8 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll index 14afbae..621f549 100644 --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -111,28 +111,28 @@ define i32 @va1(ptr %fmt, ...) { ; ; ILP32E-FPELIM-LABEL: va1: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: .cfi_def_cfa_offset 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: .cfi_def_cfa_offset 28 ; ILP32E-FPELIM-NEXT: mv a0, a1 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a1, sp, 16 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 12 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va1: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: .cfi_offset ra, -28 ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -32 -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 24 ; ILP32E-WITHFP-NEXT: mv a0, a1 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) @@ -142,9 +142,9 @@ define i32 @va1(ptr %fmt, ...) { ; ILP32E-WITHFP-NEXT: sw a1, 4(s0) ; ILP32E-WITHFP-NEXT: addi a1, s0, 8 ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va1: @@ -191,28 +191,28 @@ define i32 @va1(ptr %fmt, ...) 
{ ; ; LP64E-FPELIM-LABEL: va1: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 64 -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a0, sp, 28 -; LP64E-FPELIM-NEXT: sd a0, 8(sp) -; LP64E-FPELIM-NEXT: lw a0, 24(sp) -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 56 +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a0, sp, 20 +; LP64E-FPELIM-NEXT: sd a0, 0(sp) +; LP64E-FPELIM-NEXT: lw a0, 16(sp) +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; LP64E-WITHFP-NEXT: .cfi_offset ra, -56 ; LP64E-WITHFP-NEXT: .cfi_offset s0, -64 -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: .cfi_def_cfa s0, 48 ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a0, s0, 12 @@ -222,9 +222,9 @@ define i32 @va1(ptr %fmt, ...) { ; LP64E-WITHFP-NEXT: sd a4, 32(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) ; LP64E-WITHFP-NEXT: sd a2, 16(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -292,24 +292,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va1_va_arg: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 ; ILP32E-FPELIM-NEXT: mv a0, a1 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a1, sp, 16 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 12 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va1_va_arg: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: mv a0, a1 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) @@ -318,9 +318,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: sw a1, 4(s0) ; ILP32E-WITHFP-NEXT: addi a1, s0, 8 ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va1_va_arg: @@ -362,24 +362,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va1_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va1_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -388,9 +388,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -879,29 +879,29 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ; ILP32E-FPELIM-LABEL: va2: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 12 -; ILP32E-FPELIM-NEXT: sw a0, 4(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 19 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 8 +; ILP32E-FPELIM-NEXT: sw a0, 0(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a0, a0, -8 -; ILP32E-FPELIM-NEXT: addi a1, sp, 27 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 23 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) ; ILP32E-FPELIM-NEXT: lw a1, 4(a0) ; ILP32E-FPELIM-NEXT: lw a0, 0(a0) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va2: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) @@ -915,9 +915,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) ; ILP32E-WITHFP-NEXT: lw a1, 4(a0) ; ILP32E-WITHFP-NEXT: lw a0, 0(a0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va2: @@ -959,24 +959,24 @@ define i64 @va2(ptr %fmt, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va2: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 39 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 31 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -985,9 +985,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 23 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1076,29 +1076,29 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va2_va_arg: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 19 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a1, a0, -8 ; ILP32E-FPELIM-NEXT: addi a0, a1, 4 -; ILP32E-FPELIM-NEXT: sw a0, 4(sp) +; ILP32E-FPELIM-NEXT: sw a0, 0(sp) ; ILP32E-FPELIM-NEXT: lw a0, 0(a1) ; ILP32E-FPELIM-NEXT: addi a2, a1, 8 -; ILP32E-FPELIM-NEXT: sw a2, 4(sp) +; ILP32E-FPELIM-NEXT: sw a2, 0(sp) ; ILP32E-FPELIM-NEXT: lw a1, 4(a1) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va2_va_arg: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) @@ -1112,9 +1112,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: addi a2, a1, 8 ; ILP32E-WITHFP-NEXT: sw a2, -12(s0) ; ILP32E-WITHFP-NEXT: lw a1, 4(a1) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va2_va_arg: @@ -1156,24 +1156,24 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va2_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: mv a0, a1 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 32 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 24 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va2_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: mv a0, a1 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) @@ -1182,9 +1182,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: addi a1, s0, 16 ; LP64E-WITHFP-NEXT: sd a1, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 80 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 72 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1389,31 +1389,31 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va3: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 20 -; ILP32E-FPELIM-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 27 +; ILP32E-FPELIM-NEXT: addi sp, sp, -20 +; ILP32E-FPELIM-NEXT: sw a5, 16(sp) +; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: sw a3, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 8 +; ILP32E-FPELIM-NEXT: sw a0, 0(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a0, a0, -8 -; ILP32E-FPELIM-NEXT: addi a3, sp, 35 -; ILP32E-FPELIM-NEXT: sw a3, 12(sp) +; ILP32E-FPELIM-NEXT: addi a3, sp, 23 +; ILP32E-FPELIM-NEXT: sw a3, 0(sp) ; ILP32E-FPELIM-NEXT: lw a3, 4(a0) ; ILP32E-FPELIM-NEXT: lw a0, 0(a0) ; ILP32E-FPELIM-NEXT: add a2, a2, a3 ; ILP32E-FPELIM-NEXT: add a0, a1, a0 ; ILP32E-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32E-FPELIM-NEXT: add a1, a2, a1 -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 20 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va3: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -32 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi sp, sp, -28 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 12(s0) ; ILP32E-WITHFP-NEXT: sw a4, 8(s0) ; ILP32E-WITHFP-NEXT: sw a3, 4(s0) @@ -1429,9 +1429,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32E-WITHFP-NEXT: add a0, a1, a0 ; ILP32E-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32E-WITHFP-NEXT: add a1, a2, a1 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 32 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 28 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va3: @@ -1471,23 +1471,23 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; ; LP64E-FPELIM-LABEL: va3: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 31 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 23 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -1495,9 +1495,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 15 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -1593,31 +1593,31 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; ; ILP32E-FPELIM-LABEL: va3_va_arg: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 27 +; ILP32E-FPELIM-NEXT: addi sp, sp, -20 +; ILP32E-FPELIM-NEXT: sw a5, 16(sp) +; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: sw a3, 8(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a0, a0, -8 ; ILP32E-FPELIM-NEXT: addi a3, a0, 4 -; ILP32E-FPELIM-NEXT: sw a3, 12(sp) +; ILP32E-FPELIM-NEXT: sw a3, 0(sp) ; ILP32E-FPELIM-NEXT: lw a3, 0(a0) ; ILP32E-FPELIM-NEXT: addi a4, a0, 8 -; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: sw a4, 0(sp) ; ILP32E-FPELIM-NEXT: lw a4, 4(a0) ; ILP32E-FPELIM-NEXT: add a0, a1, a3 ; ILP32E-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32E-FPELIM-NEXT: add a2, a2, a4 ; ILP32E-FPELIM-NEXT: add a1, a2, a1 -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, 20 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va3_va_arg: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -32 -; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 16 +; ILP32E-WITHFP-NEXT: addi sp, sp, -28 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 12(s0) ; ILP32E-WITHFP-NEXT: sw a4, 8(s0) ; ILP32E-WITHFP-NEXT: sw a3, 4(s0) @@ -1633,9 +1633,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; ILP32E-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32E-WITHFP-NEXT: add a2, a2, a4 ; ILP32E-WITHFP-NEXT: add a1, a2, a1 -; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 32 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 28 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va3_va_arg: @@ -1675,23 +1675,23 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; ; LP64E-FPELIM-LABEL: va3_va_arg: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -48 -; LP64E-FPELIM-NEXT: sd a5, 40(sp) -; LP64E-FPELIM-NEXT: sd a4, 32(sp) -; LP64E-FPELIM-NEXT: sd a3, 24(sp) -; LP64E-FPELIM-NEXT: sd a2, 16(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 24 +; LP64E-FPELIM-NEXT: addi sp, sp, -40 +; LP64E-FPELIM-NEXT: sd a5, 32(sp) +; LP64E-FPELIM-NEXT: sd a4, 24(sp) +; LP64E-FPELIM-NEXT: sd a3, 16(sp) +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: addi a3, sp, 16 ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 48 +; LP64E-FPELIM-NEXT: sd a3, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va3_va_arg: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -64 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -56 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 8(s0) @@ -1699,9 +1699,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: addi a3, s0, 8 ; LP64E-WITHFP-NEXT: add a0, a1, a2 ; LP64E-WITHFP-NEXT: sd a3, -24(s0) -; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; LP64E-WITHFP-NEXT: addi sp, sp, 64 +; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LP64E-WITHFP-NEXT: addi sp, sp, 56 ; LP64E-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) @@ -2675,24 +2675,24 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; ; ILP32E-FPELIM-LABEL: va6_no_fixed_args: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: addi sp, sp, -32 -; ILP32E-FPELIM-NEXT: sw a5, 28(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 20(sp) -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: sw a0, 8(sp) -; ILP32E-FPELIM-NEXT: addi a1, sp, 12 -; ILP32E-FPELIM-NEXT: sw a1, 4(sp) -; ILP32E-FPELIM-NEXT: addi sp, sp, 32 +; ILP32E-FPELIM-NEXT: addi sp, sp, -28 +; ILP32E-FPELIM-NEXT: sw a5, 24(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 16(sp) +; ILP32E-FPELIM-NEXT: sw a2, 12(sp) +; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: sw a0, 4(sp) +; ILP32E-FPELIM-NEXT: addi a1, sp, 8 +; ILP32E-FPELIM-NEXT: sw a1, 0(sp) +; ILP32E-FPELIM-NEXT: addi sp, sp, 28 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: va6_no_fixed_args: ; ILP32E-WITHFP: # %bb.0: -; ILP32E-WITHFP-NEXT: addi sp, sp, -48 -; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill -; ILP32E-WITHFP-NEXT: addi s0, sp, 24 +; ILP32E-WITHFP-NEXT: addi sp, sp, -36 +; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill +; ILP32E-WITHFP-NEXT: addi s0, sp, 12 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) @@ -2701,9 +2701,9 @@ define i32 @va6_no_fixed_args(...) nounwind { ; ILP32E-WITHFP-NEXT: sw a0, 0(s0) ; ILP32E-WITHFP-NEXT: addi a1, s0, 4 ; ILP32E-WITHFP-NEXT: sw a1, -12(s0) -; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload -; ILP32E-WITHFP-NEXT: addi sp, sp, 48 +; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload +; ILP32E-WITHFP-NEXT: addi sp, sp, 36 ; ILP32E-WITHFP-NEXT: ret ; ; LP64-LP64F-LP64D-FPELIM-LABEL: va6_no_fixed_args: @@ -2745,24 +2745,24 @@ define i32 @va6_no_fixed_args(...) nounwind { ; ; LP64E-FPELIM-LABEL: va6_no_fixed_args: ; LP64E-FPELIM: # %bb.0: -; LP64E-FPELIM-NEXT: addi sp, sp, -64 -; LP64E-FPELIM-NEXT: sd a5, 56(sp) -; LP64E-FPELIM-NEXT: sd a4, 48(sp) -; LP64E-FPELIM-NEXT: sd a3, 40(sp) -; LP64E-FPELIM-NEXT: sd a2, 32(sp) -; LP64E-FPELIM-NEXT: sd a1, 24(sp) -; LP64E-FPELIM-NEXT: sd a0, 16(sp) -; LP64E-FPELIM-NEXT: addi a1, sp, 24 -; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: addi sp, sp, 64 +; LP64E-FPELIM-NEXT: addi sp, sp, -56 +; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 24(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) +; LP64E-FPELIM-NEXT: sd a0, 8(sp) +; LP64E-FPELIM-NEXT: addi a1, sp, 16 +; LP64E-FPELIM-NEXT: sd a1, 0(sp) +; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: ret ; ; LP64E-WITHFP-LABEL: va6_no_fixed_args: ; LP64E-WITHFP: # %bb.0: -; LP64E-WITHFP-NEXT: addi sp, sp, -80 -; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; LP64E-WITHFP-NEXT: addi s0, sp, 32 +; LP64E-WITHFP-NEXT: addi sp, sp, -72 +; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: sd a5, 40(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) @@ -2771,9 +2771,9 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; LP64E-WITHFP-NEXT: sd a0, 0(s0)
 ; LP64E-WITHFP-NEXT: addi a1, s0, 8
 ; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
 ; LP64E-WITHFP-NEXT: ret
 %va = alloca ptr
 call void @llvm.va_start(ptr %va)
@@ -2934,7 +2934,7 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; ILP32E-WITHFP-NEXT: addi s0, sp, 2020
 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 24
 ; ILP32E-WITHFP-NEXT: lui a0, 24414
-; ILP32E-WITHFP-NEXT: addi a0, a0, -1740
+; ILP32E-WITHFP-NEXT: addi a0, a0, -1748
 ; ILP32E-WITHFP-NEXT: sub sp, sp, a0
 ; ILP32E-WITHFP-NEXT: mv a0, a1
 ; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
@@ -2947,7 +2947,7 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; ILP32E-WITHFP-NEXT: sub a2, s0, a2
 ; ILP32E-WITHFP-NEXT: sw a1, -272(a2)
 ; ILP32E-WITHFP-NEXT: lui a1, 24414
-; ILP32E-WITHFP-NEXT: addi a1, a1, -1740
+; ILP32E-WITHFP-NEXT: addi a1, a1, -1748
 ; ILP32E-WITHFP-NEXT: add sp, sp, a1
 ; ILP32E-WITHFP-NEXT: lw ra, 2016(sp) # 4-byte Folded Reload
 ; ILP32E-WITHFP-NEXT: lw s0, 2012(sp) # 4-byte Folded Reload
-- cgit v1.1

From 2a4a2558f1533a91519fcc4e7abf04f845f067bd Mon Sep 17 00:00:00 2001
From: Haojian Wu
Date: Sat, 10 Feb 2024 08:54:13 +0100
Subject: Fix -Wunused-variable warning in Release build.

--- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index e852052..262e8e5 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -462,8 +462,7 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI,
 bool RISCVLegalizerInfo::shouldBeInConstantPool(APInt APImm,
                                                 bool ShouldOptForSize) const {
-  unsigned BitWidth = APImm.getBitWidth();
-  assert(BitWidth == 32 || BitWidth == 64);
+  assert(APImm.getBitWidth() == 32 || APImm.getBitWidth() == 64);
   int64_t Imm = APImm.getSExtValue();
   // All simm32 constants should be handled by isel.
   // NOTE: The getMaxBuildIntsCost call below should return a value >= 2 making
-- cgit v1.1

From 9308d6688c673606fee1625d777a52539ae72015 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sat, 10 Feb 2024 08:19:49 +0000
Subject: [Flang] Correct initial limit value in float min/maxloc reductions. (#81260)

I was looking through the code to check whether NaN was being handled
correctly, and couldn't work out why simple cases were behaving differently
than they should. It turns out the initial limit value was backwards for
minloc/maxloc reductions in general. This fixes that; the bug was
introduced in #79469.
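For reference, a sketch of what the corrected initialization computes (the
f32 constants match the CHECK lines added to the tests below; isMax selects
the maxloc variant):

  // maxloc must start from the most negative value so that any element,
  // including negative ones, can replace it:
  llvm::APFloat::getLargest(sem, /*Negative=*/true);  // -3.40282347E+38 for f32
  // minloc starts from the largest positive value:
  llvm::APFloat::getLargest(sem, /*Negative=*/false); //  3.40282347E+38 for f32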
--- flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp | 2 +-
 flang/test/HLFIR/maxloc-elemental.fir | 1 +
 flang/test/HLFIR/minloc-elemental.fir | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index b1165a5..523671f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -854,7 +854,7 @@ public:
       const llvm::fltSemantics &sem = ty.getFloatSemantics();
       return builder.createRealConstant(
           loc, elementType,
-          llvm::APFloat::getLargest(sem, /*Negative=*/!isMax));
+          llvm::APFloat::getLargest(sem, /*Negative=*/isMax));
     }
     unsigned bits = elementType.getIntOrFloatBitWidth();
     int64_t limitInt =
diff --git a/flang/test/HLFIR/maxloc-elemental.fir b/flang/test/HLFIR/maxloc-elemental.fir
index 67cd9ee..b4a3ca0 100644
--- a/flang/test/HLFIR/maxloc-elemental.fir
+++ b/flang/test/HLFIR/maxloc-elemental.fir
@@ -110,6 +110,7 @@ func.func @_QPtest_float(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
   return
 }
 // CHECK-LABEL: _QPtest_float
+// CHECK: %cst = arith.constant -3.40282347E+38 : f32
 // CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) {
 // CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index
 // CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
diff --git a/flang/test/HLFIR/minloc-elemental.fir b/flang/test/HLFIR/minloc-elemental.fir
index cb483d5..5cc608b 100644
--- a/flang/test/HLFIR/minloc-elemental.fir
+++ b/flang/test/HLFIR/minloc-elemental.fir
@@ -295,6 +295,7 @@ func.func @_QPtest_float(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
   return
 }
 // CHECK-LABEL: _QPtest_float
+// CHECK: %cst = arith.constant 3.40282347E+38 : f32
 // CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) {
 // CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index
 // CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-- cgit v1.1

From d26b43ff4f7396f79de4b099160262c750d6aba7 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <6532716+alexander-shaposhnikov@users.noreply.github.com>
Date: Sat, 10 Feb 2024 01:12:46 -0800
Subject: Add JumpTableToSwitch pass (#77709)

Add a pass to convert jump tables to switches.
The new pass replaces an indirect call with a switch + direct calls if all
the functions in the jump table are smaller than the provided threshold.
The pass is currently disabled by default
and can be enabled by -enable-jump-table-to-switch.
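For illustration, with the pass enabled a call through a constant table of
function pointers such as (taken from the basic.ll test added below)

  %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index
  %func_ptr = load ptr, ptr %gep
  %result = call i32 %func_ptr()

is rewritten into a switch over %index with one direct call per jump-table
entry and an unreachable default case.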
Test plan: ninja check-all
--- .../llvm/Transforms/Scalar/JumpTableToSwitch.h | 24 +++
 llvm/lib/Passes/PassBuilder.cpp | 1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp | 9 +
 llvm/lib/Passes/PassRegistry.def | 1 +
 llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 +
 llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp | 190 +++++++++++++++++
 llvm/test/Other/new-pm-defaults.ll | 5 +
 llvm/test/Transforms/JumpTableToSwitch/basic.ll | 228 +++++++++++++++++++++
 .../JumpTableToSwitch/max_function_size.ll | 28 +++
 llvm/test/Transforms/JumpTableToSwitch/remarks.ll | 36 ++++
 llvm/test/Transforms/JumpTableToSwitch/skip.ll | 131 ++++++++++++
 llvm/test/Transforms/JumpTableToSwitch/stride.ll | 36 ++++
 llvm/test/Transforms/JumpTableToSwitch/struct.ll | 42 ++++
 13 files changed, 732 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
 create mode 100644 llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/basic.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/remarks.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/skip.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/stride.ll
 create mode 100644 llvm/test/Transforms/JumpTableToSwitch/struct.ll

diff --git a/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
new file mode 100644
index 0000000..6178622
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
@@ -0,0 +1,24 @@
+//===- JumpTableToSwitch.h - ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
+#define LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Function;
+
+struct JumpTableToSwitchPass : PassInfoMixin<JumpTableToSwitchPass> {
+  /// Run the pass over the function.
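+  /// Returns PreservedAnalyses::all() when no jump table was converted;
+  /// otherwise cached dominator and post-dominator trees are updated and
+  /// preserved.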
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 007dc76..e3f2502 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -201,6 +201,7 @@
 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
 #include "llvm/Transforms/Scalar/InferAlignment.h"
 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
 #include "llvm/Transforms/Scalar/LICM.h"
 #include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6ede863..4e233d9 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -91,6 +91,7 @@
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
 #include "llvm/Transforms/Scalar/InferAlignment.h"
 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
 #include "llvm/Transforms/Scalar/LICM.h"
 #include "llvm/Transforms/Scalar/LoopDeletion.h"
@@ -237,6 +238,10 @@
 static cl::opt<bool>
     EnableGVNSink("enable-gvn-sink",
                   cl::desc("Enable the GVN sinking pass (default = off)"));

+static cl::opt<bool> EnableJumpTableToSwitch(
+    "enable-jump-table-to-switch",
+    cl::desc("Enable JumpTableToSwitch pass (default = off)"));
+
 // This option is used in simplifying testing SampleFDO optimizations for
 // profile loading.
 static cl::opt<bool>
@@ -559,6 +564,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(JumpThreadingPass());
   FPM.addPass(CorrelatedValuePropagationPass());

+  // Jump table to switch conversion.
+  if (EnableJumpTableToSwitch)
+    FPM.addPass(JumpTableToSwitchPass());
+
   FPM.addPass(
       SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
   FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 6cb87fb..afa5a65 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -348,6 +348,7 @@ FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(TM))
 FUNCTION_PASS("invalidate", InvalidateAllAnalysesPass())
 FUNCTION_PASS("irce", IRCEPass())
 FUNCTION_PASS("jump-threading", JumpThreadingPass())
+FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass());
 FUNCTION_PASS("kcfi", KCFIPass())
 FUNCTION_PASS("lcssa", LCSSAPass())
 FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass())
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 5527efa..ba09ebf 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_component_library(LLVMScalarOpts
   InferAlignment.cpp
   InstSimplifyPass.cpp
   JumpThreading.cpp
+  JumpTableToSwitch.cpp
   LICM.cpp
   LoopAccessAnalysisPrinter.cpp
   LoopBoundSplit.cpp
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
new file mode 100644
index 0000000..f9712db
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -0,0 +1,190 @@
+//===- JumpTableToSwitch.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+    JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
+                           cl::desc("Only split jump tables with size less or "
+                                    "equal than JumpTableSizeThreshold."),
+                           cl::init(10));
+
+// TODO: Consider adding a cost model for profitability analysis of this
+// transformation. Currently we replace a jump table with a switch if all the
+// functions in the jump table are smaller than the provided threshold.
+static cl::opt<unsigned> FunctionSizeThreshold(
+    "jump-table-to-switch-function-size-threshold", cl::Hidden,
+    cl::desc("Only split jump tables containing functions whose sizes are less "
+             "or equal than this threshold."),
+    cl::init(50));
+
+#define DEBUG_TYPE "jump-table-to-switch"
+
+namespace {
+struct JumpTableTy {
+  Value *Index;
+  SmallVector<Function *, 10> Funcs;
+};
+} // anonymous namespace
+
+static std::optional<JumpTableTy> parseJumpTable(GetElementPtrInst *GEP,
+                                                 PointerType *PtrTy) {
+  Constant *Ptr = dyn_cast<Constant>(GEP->getPointerOperand());
+  if (!Ptr)
+    return std::nullopt;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr);
+  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+    return std::nullopt;
+
+  Function &F = *GEP->getParent()->getParent();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const unsigned BitWidth =
+      DL.getIndexSizeInBits(GEP->getPointerAddressSpace());
+  MapVector<Value *, APInt> VariableOffsets;
+  APInt ConstantOffset(BitWidth, 0);
+  if (!GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset))
+    return std::nullopt;
+  if (VariableOffsets.size() != 1)
+    return std::nullopt;
+  // TODO: consider supporting more general patterns
+  if (!ConstantOffset.isZero())
+    return std::nullopt;
+  APInt StrideBytes = VariableOffsets.front().second;
+  const uint64_t JumpTableSizeBytes = DL.getTypeAllocSize(GV->getValueType());
+  if (JumpTableSizeBytes % StrideBytes.getZExtValue() != 0)
+    return std::nullopt;
+  const uint64_t N = JumpTableSizeBytes / StrideBytes.getZExtValue();
+  if (N > JumpTableSizeThreshold)
+    return std::nullopt;
+
+  JumpTableTy JumpTable;
+  JumpTable.Index = VariableOffsets.front().first;
+  JumpTable.Funcs.reserve(N);
+  for (uint64_t Index = 0; Index < N; ++Index) {
+    // ConstantOffset is zero.
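+    // Each table entry lives at Index * StrideBytes bytes from the start of
+    // the initializer, so the callee can be constant-folded out of it.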
+    APInt Offset = Index * StrideBytes;
+    Constant *C =
+        ConstantFoldLoadFromConst(GV->getInitializer(), PtrTy, Offset, DL);
+    auto *Func = dyn_cast_or_null<Function>(C);
+    if (!Func || Func->isDeclaration() ||
+        Func->getInstructionCount() > FunctionSizeThreshold)
+      return std::nullopt;
+    JumpTable.Funcs.push_back(Func);
+  }
+  return JumpTable;
+}
+
+static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT,
+                                  DomTreeUpdater &DTU,
+                                  OptimizationRemarkEmitter &ORE) {
+  const bool IsVoid = CB->getType() == Type::getVoidTy(CB->getContext());
+
+  SmallVector<DominatorTree::UpdateType> DTUpdates;
+  BasicBlock *BB = CB->getParent();
+  BasicBlock *Tail = SplitBlock(BB, CB, &DTU, nullptr, nullptr,
+                                BB->getName() + Twine(".tail"));
+  DTUpdates.push_back({DominatorTree::Delete, BB, Tail});
+  BB->getTerminator()->eraseFromParent();
+
+  Function &F = *BB->getParent();
+  BasicBlock *BBUnreachable = BasicBlock::Create(
+      F.getContext(), "default.switch.case.unreachable", &F, Tail);
+  IRBuilder<> BuilderUnreachable(BBUnreachable);
+  BuilderUnreachable.CreateUnreachable();
+
+  IRBuilder<> Builder(BB);
+  SwitchInst *Switch = Builder.CreateSwitch(JT.Index, BBUnreachable);
+  DTUpdates.push_back({DominatorTree::Insert, BB, BBUnreachable});
+
+  IRBuilder<> BuilderTail(CB);
+  PHINode *PHI =
+      IsVoid ? nullptr : BuilderTail.CreatePHI(CB->getType(), JT.Funcs.size());
+
+  for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) {
+    BasicBlock *B = BasicBlock::Create(Func->getContext(),
+                                       "call." + Twine(Index), &F, Tail);
+    DTUpdates.push_back({DominatorTree::Insert, BB, B});
+    DTUpdates.push_back({DominatorTree::Insert, B, Tail});
+
+    CallBase *Call = cast<CallBase>(CB->clone());
+    Call->setCalledFunction(Func);
+    Call->insertInto(B, B->end());
+    Switch->addCase(
+        cast<ConstantInt>(ConstantInt::get(JT.Index->getType(), Index)), B);
+    BranchInst::Create(Tail, B);
+    if (PHI)
+      PHI->addIncoming(Call, B);
+  }
+  DTU.applyUpdates(DTUpdates);
+  ORE.emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "ReplacedJumpTableWithSwitch", CB)
+           << "expanded indirect call into switch";
+  });
+  if (PHI)
+    CB->replaceAllUsesWith(PHI);
+  CB->eraseFromParent();
+  return Tail;
+}
+
+PreservedAnalyses JumpTableToSwitchPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  OptimizationRemarkEmitter &ORE =
+      AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  DominatorTree *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy);
+  bool Changed = false;
+  for (BasicBlock &BB : make_early_inc_range(F)) {
+    BasicBlock *CurrentBB = &BB;
+    while (CurrentBB) {
+      BasicBlock *SplittedOutTail = nullptr;
+      for (Instruction &I : make_early_inc_range(*CurrentBB)) {
+        auto *Call = dyn_cast<CallBase>(&I);
+        if (!Call || Call->getCalledFunction() || Call->isMustTailCall())
+          continue;
+        auto *L = dyn_cast<LoadInst>(Call->getCalledOperand());
+        // Skip atomic or volatile loads.
+        if (!L || !L->isSimple())
+          continue;
+        auto *GEP = dyn_cast<GetElementPtrInst>(L->getPointerOperand());
+        if (!GEP)
+          continue;
+        auto *PtrTy = dyn_cast<PointerType>(L->getType());
+        assert(PtrTy && "call operand must be a pointer");
+        std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy);
+        if (!JumpTable)
+          continue;
+        SplittedOutTail = expandToSwitch(Call, *JumpTable, DTU, ORE);
+        Changed = true;
+        break;
+      }
+      CurrentBB = SplittedOutTail ?
SplittedOutTail : nullptr;
+    }
+  }
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  if (DT)
+    PA.preserve<DominatorTreeAnalysis>();
+  if (PDT)
+    PA.preserve<PostDominatorTreeAnalysis>();
+  return PA;
+}
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ecdb5a5..51fb93d 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -72,6 +72,10 @@
 ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ
 ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
+; RUN:     -passes='default<O3>' -enable-jump-table-to-switch -S %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-JUMP-TABLE-TO-SWITCH,CHECK-O23SZ,%llvmcheckext
+
+; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
 ; RUN:     -passes='default<O3>' -enable-matrix -S %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX
@@ -151,6 +155,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-JUMP-TABLE-TO-SWITCH-NEXT: Running pass: JumpTableToSwitchPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/Transforms/JumpTableToSwitch/basic.ll b/llvm/test/Transforms/JumpTableToSwitch/basic.ll
new file mode 100644
index 0000000..321f837
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/basic.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s
+; RUN: opt < %s -passes=jump-table-to-switch -jump-table-to-switch-size-threshold=0 -verify-dom-info -S | FileCheck %s --check-prefix=THRESHOLD-0
+
+@func_array = constant [2 x ptr] [ptr @func0, ptr @func1]
+
+define i32 @func0() {
+  ret i32 1
+}
+
+define i32 @func1() {
+  ret i32 2
+}
+
+define i32 @function_with_jump_table(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1()
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+; THRESHOLD-0-LABEL: define i32 @function_with_jump_table(
+; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) {
+; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; THRESHOLD-0-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]()
+; THRESHOLD-0-NEXT: ret i32 [[RESULT]]
+;
+  %gep =
getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr() + ret i32 %result +} + +define i32 @basic_block_splitted_twice(i32 %index) { +; CHECK-LABEL: define i32 @basic_block_splitted_twice( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR1:%.*]] = load ptr, ptr [[GEP1]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR2:%.*]] = load ptr, ptr [[GEP2]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE1:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_02:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_13:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable1: +; CHECK-NEXT: unreachable +; CHECK: call.02: +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL_TAIL:%.*]] +; CHECK: call.13: +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @func1() +; CHECK-NEXT: br label [[DOTTAIL_TAIL]] +; CHECK: .tail.tail: +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[TMP4]], [[CALL_02]] ], [ [[TMP5]], [[CALL_13]] ] +; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[TMP3]], [[TMP6]] +; CHECK-NEXT: ret i32 [[RESULT]] +; +; THRESHOLD-0-LABEL: define i32 @basic_block_splitted_twice( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) { +; THRESHOLD-0-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR1:%.*]] = load ptr, ptr [[GEP1]], align 8 +; THRESHOLD-0-NEXT: [[RESULT1:%.*]] = call i32 [[FUNC_PTR1]]() +; THRESHOLD-0-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR2:%.*]] = load ptr, ptr [[GEP2]], align 8 +; THRESHOLD-0-NEXT: [[RESULT2:%.*]] = call i32 [[FUNC_PTR2]]() +; THRESHOLD-0-NEXT: [[RESULT:%.*]] = add i32 [[RESULT1]], [[RESULT2]] +; THRESHOLD-0-NEXT: ret i32 [[RESULT]] +; + %gep1 = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr1 = load ptr, ptr %gep1 + %result1 = call i32 %func_ptr1() + %gep2 = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr2 = load ptr, ptr %gep2 + %result2 = call i32 %func_ptr2() + %result = add i32 %result1, %result2 + ret i32 %result +} + +define void @void_func0() { + ret void +} + +define void @void_func1() { + ret void +} + +@void_func_array = constant [2 x ptr] [ptr @void_func0, ptr @void_func1] + +define void @void_function_with_jump_table(i32 %index) { +; CHECK-LABEL: define void @void_function_with_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; 
CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: call void @void_func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: call void @void_func1() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: ret void +; +; THRESHOLD-0-LABEL: define void @void_function_with_jump_table( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) { +; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; THRESHOLD-0-NEXT: call void [[FUNC_PTR]]() +; THRESHOLD-0-NEXT: ret void +; + %gep = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + call void %func_ptr() + ret void +} + +define void @void_function_with_jump_table_and_call_site_attr(i32 %index) { +; CHECK-LABEL: define void @void_function_with_jump_table_and_call_site_attr( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: call void @void_func0() #[[ATTR0:[0-9]+]] +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: call void @void_func1() #[[ATTR0]] +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: ret void +; +; THRESHOLD-0-LABEL: define void @void_function_with_jump_table_and_call_site_attr( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) { +; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; THRESHOLD-0-NEXT: call void [[FUNC_PTR]]() #[[ATTR0:[0-9]+]] +; THRESHOLD-0-NEXT: ret void +; + %gep = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + call void %func_ptr() nounwind + ret void +} + + +define i32 @func0_addrspace_42() addrspace(42) { + ret i32 1 +} + +define i32 @func1_addrspace_42() addrspace(42) { + ret i32 2 +} + +@func_array_addrspace_42 = addrspace(42) constant [2 x ptr addrspace(42)] [ptr addrspace(42) @func0_addrspace_42, ptr addrspace(42) @func1_addrspace_42] + +define i32 @function_with_jump_table_addrspace_42(i32 %index) addrspace(42) { +; CHECK-LABEL: define i32 @function_with_jump_table_addrspace_42( +; CHECK-SAME: i32 [[INDEX:%.*]]) addrspace(42) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr addrspace(42), ptr addrspace(42) [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call addrspace(42) i32 @func0_addrspace_42() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: [[TMP2:%.*]] = call 
addrspace(42) i32 @func1_addrspace_42() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; +; THRESHOLD-0-LABEL: define i32 @function_with_jump_table_addrspace_42( +; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) addrspace(42) { +; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 [[INDEX]] +; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr addrspace(42), ptr addrspace(42) [[GEP]], align 8 +; THRESHOLD-0-NEXT: [[RESULT:%.*]] = call addrspace(42) i32 [[FUNC_PTR]]() +; THRESHOLD-0-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 %index + %func_ptr = load ptr addrspace(42), ptr addrspace(42) %gep, align 8 + %result = call addrspace(42) i32 %func_ptr() + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll b/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll new file mode 100644 index 0000000..f4e9911 --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -jump-table-to-switch-function-size-threshold=1 -verify-dom-info -S | FileCheck %s + +@func_array0 = constant [2 x ptr] [ptr @func0, ptr @large_func] + +define i32 @func0() { + ret i32 1 +} + +define i32 @large_func() { + %x = add i32 1, 2 + ret i32 %x +} + +define i32 @function_with_jump_table_with_large_func(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_with_large_func( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]() +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr() + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/remarks.ll b/llvm/test/Transforms/JumpTableToSwitch/remarks.ll new file mode 100644 index 0000000..84d4c19 --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/remarks.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -passes=jump-table-to-switch -pass-remarks=jump-table-to-switch -S -o /dev/null 2>&1 | FileCheck %s + +; CHECK: remark: /tmp/tmp.cc:2:20: expanded indirect call into switch + +@func_array = constant [2 x ptr] [ptr @func0, ptr @func1] + +define i32 @func0() { + ret i32 1 +} + +define i32 @func1() { + ret i32 2 +} + +define i32 @function_with_jump_table(i32 %index) { + %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr(), !dbg !8 + ret i32 %result +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 18.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "/tmp/tmp.cc", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"PIC Level", i32 2} +!5 = !{!"clang version 18.0.0 "} +!6 = distinct !DISubprogram(name: "success", 
scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 2, column: 20, scope: !6) +!9 = !DILocation(line: 2, column: 21, scope: !6) +!10 = !DILocation(line: 2, column: 22, scope: !6) diff --git a/llvm/test/Transforms/JumpTableToSwitch/skip.ll b/llvm/test/Transforms/JumpTableToSwitch/skip.ll new file mode 100644 index 0000000..4504423 --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/skip.ll @@ -0,0 +1,131 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s + +@func_array0 = constant [2 x ptr] [ptr @func0, ptr @declared_only_func1] + +define i32 @func0() { + ret i32 1 +} + +declare i32 @declared_only_func1() + +define i32 @function_with_jump_table_with_a_declared_only_func(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_with_a_declared_only_func( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]() +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr() + ret i32 %result +} + +declare i32 @__gxx_personality_v0(...) + +define i32 @function_with_jump_table_invoke(i32 %index) personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: define i32 @function_with_jump_table_invoke( +; CHECK-SAME: i32 [[INDEX:%.*]]) personality ptr @__gxx_personality_v0 { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = invoke i32 [[FUNC_PTR]]() +; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTIONAL:%.*]] +; CHECK: normal: +; CHECK-NEXT: ret i32 [[RESULT]] +; CHECK: exceptional: +; CHECK-NEXT: [[LANDING_PAD:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: resume { ptr, i32 } [[LANDING_PAD]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = invoke i32 %func_ptr() to label %normal unwind label %exceptional +normal: + ret i32 %result +exceptional: + %landing_pad = landingpad { ptr, i32 } catch ptr null + resume { ptr, i32 } %landing_pad +} + +@func_array1 = constant [1 x ptr] [ptr @func2] + +define i32 @func2(i32 %arg) { + ret i32 %arg +} + +define i32 @function_with_jump_table_musttail_call(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_musttail_call( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = musttail call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = musttail call i32 %func_ptr(i32 %index) + ret i32 %result +} + +define i32 @function_with_jump_table_and_volatile_load(i32 %index) { +; CHECK-LABEL: define i32 
@function_with_jump_table_and_volatile_load( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load volatile ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index + %func_ptr = load volatile ptr, ptr %gep, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + +define i32 @function_with_jump_table_and_atomic_load(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table_and_atomic_load( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load atomic ptr, ptr [[GEP]] monotonic, align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index + %func_ptr = load atomic ptr, ptr %gep monotonic, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + +@func_array2 = global [1 x ptr] [ptr @func2] + +define i32 @function_with_nonconstant_jump_table(i32 %index) { +; CHECK-LABEL: define i32 @function_with_nonconstant_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array2, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array2, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + +@func_array3 = weak constant [1 x ptr] [ptr @func2] + +define i32 @function_with_constant_weak_jump_table(i32 %index) { +; CHECK-LABEL: define i32 @function_with_constant_weak_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array3, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]]) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %gep = getelementptr inbounds [1 x ptr], ptr @func_array3, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep, align 8 + %result = call i32 %func_ptr(i32 %index) + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/stride.ll b/llvm/test/Transforms/JumpTableToSwitch/stride.ll new file mode 100644 index 0000000..ef86e9d --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/stride.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s + +@func_array = constant [2 x ptr] [ptr @func0, ptr @func1] + +define i32 @func0() { + ret i32 1 +} + +define i32 @func1() { + ret i32 2 +} + +define i32 @check_stride(i32 %index) { +; CHECK-LABEL: define i32 @check_stride( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x { ptr, ptr }], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, 
label [[CALL_0:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ] +; CHECK-NEXT: ret i32 [[TMP2]] +; + %gep = getelementptr inbounds [2 x { ptr, ptr }], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr() + ret i32 %result +} + diff --git a/llvm/test/Transforms/JumpTableToSwitch/struct.ll b/llvm/test/Transforms/JumpTableToSwitch/struct.ll new file mode 100644 index 0000000..7aa709c --- /dev/null +++ b/llvm/test/Transforms/JumpTableToSwitch/struct.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s + +%"struct_ty" = type { [2 x ptr] } + +@func_array = constant %"struct_ty" { [2 x ptr] [ptr @func0, ptr @func1] } + +define i32 @func0() { + ret i32 1 +} + +define i32 @func1() { + ret i32 2 +} + +define i32 @function_with_jump_table(i32 %index) { +; CHECK-LABEL: define i32 @function_with_jump_table( +; CHECK-SAME: i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8 +; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 0, label [[CALL_0:%.*]] +; CHECK-NEXT: i32 1, label [[CALL_1:%.*]] +; CHECK-NEXT: ] +; CHECK: default.switch.case.unreachable: +; CHECK-NEXT: unreachable +; CHECK: call.0: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0() +; CHECK-NEXT: br label [[DOTTAIL:%.*]] +; CHECK: call.1: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1() +; CHECK-NEXT: br label [[DOTTAIL]] +; CHECK: .tail: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; + %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index + %func_ptr = load ptr, ptr %gep + %result = call i32 %func_ptr() + ret i32 %result +} + -- cgit v1.1 From fd140d4283652ff7a906f4ebaaa75c8fcf00d39b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 10 Feb 2024 09:13:09 +0000 Subject: [gn build] Port d26b43ff4f73 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn index a1c0427..f080c06 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -35,6 +35,7 @@ static_library("Scalar") { "InferAddressSpaces.cpp", "InferAlignment.cpp", "InstSimplifyPass.cpp", + "JumpTableToSwitch.cpp", "JumpThreading.cpp", "LICM.cpp", "LoopAccessAnalysisPrinter.cpp", -- cgit v1.1 From f022aaf4e722eae9d0feaf7715a5d8960f4d017b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 10 Feb 2024 11:33:41 +0200 Subject: Revert "[InstCombine] Optimise x / sqrt(y / z) with fast-math pattern. (#76737)" This reverts commit bb5c3899d1936ebdf7ebf5ca4347ee2e057bee7f. 
That commit caused failed asserts like this:

$ cat repro.c
float a, b;
double sqrt();
void c() { b = a / sqrt(a); }
$ clang -target x86_64-linux-gnu -c -O2 -ffast-math repro.c
clang: ../lib/IR/Instruction.cpp:522: bool llvm::Instruction::hasAllowReassoc() const: Assertion `isa<FPMathOperator>(this) && "getting fast-math flag on invalid op"' failed.

--- .../InstCombine/InstCombineMulDivRem.cpp | 30 ----------------------
 llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 18 ++++++-------
 2 files changed, 9 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 5918567..f9cee9d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1709,33 +1709,6 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I,
   return BinaryOperator::CreateFMulFMF(Op0, Pow, &I);
 }

-/// Convert div to mul if we have an sqrt divisor iff sqrt's operand is a fdiv
-/// instruction.
-static Instruction *foldFDivSqrtDivisor(BinaryOperator &I,
-                                        InstCombiner::BuilderTy &Builder) {
-  // X / sqrt(Y / Z) --> X * sqrt(Z / Y)
-  if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
-    return nullptr;
-  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-  auto *II = dyn_cast<IntrinsicInst>(Op1);
-  if (!II || II->getIntrinsicID() != Intrinsic::sqrt || !II->hasOneUse() ||
-      !II->hasAllowReassoc() || !II->hasAllowReciprocal())
-    return nullptr;
-
-  Value *Y, *Z;
-  auto *DivOp = dyn_cast<Instruction>(II->getOperand(0));
-  if (!DivOp || !DivOp->hasAllowReassoc() || !I.hasAllowReciprocal() ||
-      !DivOp->hasOneUse())
-    return nullptr;
-  if (match(DivOp, m_FDiv(m_Value(Y), m_Value(Z)))) {
-    Value *SwapDiv = Builder.CreateFDivFMF(Z, Y, DivOp);
-    Value *NewSqrt =
-        Builder.CreateUnaryIntrinsic(II->getIntrinsicID(), SwapDiv, II);
-    return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I);
-  }
-  return nullptr;
-}
-
 Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
   Module *M = I.getModule();
@@ -1843,9 +1816,6 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
   if (Instruction *Mul = foldFDivPowDivisor(I, Builder))
     return Mul;

-  if (Instruction *Mul = foldFDivSqrtDivisor(I, Builder))
-    return Mul;
-
   // pow(X, Y) / X --> pow(X, Y-1)
   if (I.hasAllowReassoc() &&
       match(Op0, m_OneUse(m_Intrinsic<Intrinsic::pow>(m_Specific(Op1),
diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
index 361837e..346271b 100644
--- a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
+++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
@@ -6,9 +6,9 @@ declare double @llvm.sqrt.f64(double)
 define double @sqrt_div_fast(double %x, double %y, double %z) {
 ; CHECK-LABEL: @sqrt_div_fast(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast double [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.sqrt.f64(double [[TMP0]])
-; CHECK-NEXT: [[DIV1:%.*]] = fmul fast double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]]
 ; CHECK-NEXT: ret double [[DIV1]]
 ;
 entry:
@@ -36,9 +36,9 @@ entry:
 define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) {
 ; CHECK-LABEL: @sqrt_div_reassoc_arcp(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc arcp double [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double
[[TMP0]])
-; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
 ; CHECK-NEXT: ret double [[DIV1]]
 ;
 entry:
@@ -96,9 +96,9 @@ entry:
 define double @sqrt_div_arcp_missing(double %x, double %y, double %z) {
 ; CHECK-LABEL: @sqrt_div_arcp_missing(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]])
-; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
 ; CHECK-NEXT: ret double [[DIV1]]
 ;
 entry:
-- cgit v1.1

From 8884ba43a8485bebef5c4d41e7ed457e3fa84f07 Mon Sep 17 00:00:00 2001
From: David CARLIER
Date: Sat, 10 Feb 2024 10:08:53 +0000
Subject: [lldb] Fix FreeBSD build. (#81353)

--- lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
index 997b590..abfbdb1 100644
--- a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
+++ b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
@@ -50,7 +50,7 @@ private:
 class ProcessFreeBSDKernelKVM : public ProcessFreeBSDKernel {
 public:
   ProcessFreeBSDKernelKVM(lldb::TargetSP target_sp, lldb::ListenerSP listener,
-                          kvm_t *fvc);
+                          kvm_t *fvc, const FileSpec &core_file);

   ~ProcessFreeBSDKernelKVM();
-- cgit v1.1

From 33c6b77d2a18862fb5b16160ef9d600382e93f19 Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Sat, 10 Feb 2024 12:46:42 +0100
Subject: [llvm-lib][Object] Add support for EC importlib symbols. (#81059)

ARM64EC import libraries expose two additional symbols: a mangled thunk
symbol (like `#func`) and an auxiliary import symbol (like
`__imp_aux_func`). The main functional change in this patch is that those
symbols are properly added to the static library ECSYMBOLS map.
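For example, for an exported function funcexp, the import library now
exposes four symbols instead of two (taken from the updated
arm64ec-implib.test below):

  __imp_funcexp      (import pointer)
  funcexp            (thunk)
  __imp_aux_funcexp  (auxiliary import pointer)
  #funcexp           (mangled ARM64EC thunk)

A C++ export such as ?test_cpp_func@@YAHPEAX@Z likewise gets
?test_cpp_func@@$$hYAHPEAX@Z as its mangled thunk symbol.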
--- llvm/include/llvm/Object/COFF.h | 41 ++++++
 llvm/include/llvm/Object/COFFImportFile.h | 28 +++-
 llvm/lib/Object/COFFImportFile.cpp | 15 +++
 .../Target/AArch64/AArch64Arm64ECCallLowering.cpp | 2 +
 llvm/lib/Target/AArch64/AArch64MCInstLower.cpp | 2 +
 llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 28 ----
 llvm/test/tools/llvm-lib/arm64ec-implib.test | 141 ++++++++++++++++++++-
 7 files changed, 225 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h
index a548b2c..2a5c3d8 100644
--- a/llvm/include/llvm/Object/COFF.h
+++ b/llvm/include/llvm/Object/COFF.h
@@ -1362,6 +1362,47 @@ public:
   SectionStrippedError() { setErrorCode(object_error::section_stripped); }
 };

+inline std::optional<std::string>
+getArm64ECMangledFunctionName(StringRef Name) {
+  bool IsCppFn = Name[0] == '?';
+  if (IsCppFn && Name.find("$$h") != std::string::npos)
+    return std::nullopt;
+  if (!IsCppFn && Name[0] == '#')
+    return std::nullopt;
+
+  StringRef Prefix = "$$h";
+  size_t InsertIdx = 0;
+  if (IsCppFn) {
+    InsertIdx = Name.find("@@");
+    size_t ThreeAtSignsIdx = Name.find("@@@");
+    if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) {
+      InsertIdx += 2;
+    } else {
+      InsertIdx = Name.find("@");
+      if (InsertIdx != std::string::npos)
+        InsertIdx++;
+    }
+  } else {
+    Prefix = "#";
+  }
+
+  return std::optional<std::string>(
+      (Name.substr(0, InsertIdx) + Prefix + Name.substr(InsertIdx)).str());
+}
+
+inline std::optional<std::string>
+getArm64ECDemangledFunctionName(StringRef Name) {
+  if (Name[0] == '#')
+    return std::string(Name.substr(1));
+  if (Name[0] != '?')
+    return std::nullopt;
+
+  std::pair<StringRef, StringRef> Pair = Name.split("$$h");
+  if (Pair.second.empty())
+    return std::nullopt;
+  return (Pair.first + Pair.second).str();
+}
+
 } // end namespace object
 } // end namespace llvm
diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h
index 7c5846e9..46a982d 100644
--- a/llvm/include/llvm/Object/COFFImportFile.h
+++ b/llvm/include/llvm/Object/COFFImportFile.h
@@ -27,6 +27,9 @@ namespace llvm {
 namespace object {

 class COFFImportFile : public SymbolicFile {
+private:
+  enum SymbolIndex { ImpSymbol, ThunkSymbol, ECAuxSymbol, ECThunkSymbol };
+
 public:
   COFFImportFile(MemoryBufferRef Source)
       : SymbolicFile(ID_COFFImportFile, Source) {}
@@ -36,9 +39,23 @@ public:
   void moveSymbolNext(DataRefImpl &Symb) const override { ++Symb.p; }

   Error printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override {
-    if (Symb.p == 0)
+    switch (Symb.p) {
+    case ImpSymbol:
       OS << "__imp_";
-    OS << StringRef(Data.getBufferStart() + sizeof(coff_import_header));
+      break;
+    case ECAuxSymbol:
+      OS << "__imp_aux_";
+      break;
+    }
+    const char *Name = Data.getBufferStart() + sizeof(coff_import_header);
+    if (Symb.p != ECThunkSymbol && COFF::isArm64EC(getMachine())) {
+      if (std::optional<std::string> DemangledName =
+              getArm64ECDemangledFunctionName(Name)) {
+        OS << StringRef(*DemangledName);
+        return Error::success();
+      }
+    }
+    OS << StringRef(Name);
     return Error::success();
   }

@@ -52,7 +69,12 @@ public:
   basic_symbol_iterator symbol_end() const override {
     DataRefImpl Symb;
-    Symb.p = isData() ?
1 : 2;
+    if (isData())
+      Symb.p = ImpSymbol + 1;
+    else if (COFF::isArm64EC(getMachine()))
+      Symb.p = ECThunkSymbol + 1;
+    else
+      Symb.p = ThunkSymbol + 1;
     return BasicSymbolRef(Symb, this);
   }

diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index 51e6274..a3e5e78 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -684,6 +684,21 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
       NameType = getNameType(SymbolName, E.Name, Machine, MinGW);
     }

+    // On ARM64EC, use EXPORTAS to import demangled name for mangled symbols.
+    if (ImportType == IMPORT_CODE && isArm64EC(Machine)) {
+      if (std::optional<std::string> MangledName =
+              getArm64ECMangledFunctionName(Name)) {
+        if (ExportName.empty()) {
+          NameType = IMPORT_NAME_EXPORTAS;
+          ExportName.swap(Name);
+        }
+        Name = std::move(*MangledName);
+      } else if (ExportName.empty()) {
+        NameType = IMPORT_NAME_EXPORTAS;
+        ExportName = std::move(*getArm64ECDemangledFunctionName(Name));
+      }
+    }
+
     Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType,
                                            NameType, ExportName, Machine));
   }
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 91b4f18..c62582a 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -24,11 +24,13 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"

 using namespace llvm;
+using namespace llvm::object;

 using OperandBundleDef = OperandBundleDefT<Value *>;

diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 1e12cf5..37d621c 100644
--- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -23,11 +23,13 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"

 using namespace llvm;
+using namespace llvm::object;

 extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration;

diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e3f1d25..ed8336a 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -248,34 +248,6 @@ static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) {
   return false;
 }

-static inline std::optional<std::string>
-getArm64ECMangledFunctionName(std::string Name) {
-  bool IsCppFn = Name[0] == '?';
-  if (IsCppFn && Name.find("$$h") != std::string::npos)
-    return std::nullopt;
-  if (!IsCppFn && Name[0] == '#')
-    return std::nullopt;
-
-  StringRef Prefix = "$$h";
-  size_t InsertIdx = 0;
-  if (IsCppFn) {
-    InsertIdx = Name.find("@@");
-    size_t ThreeAtSignsIdx = Name.find("@@@");
-    if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) {
-      InsertIdx += 2;
-    } else {
-      InsertIdx = Name.find("@");
-      if (InsertIdx != std::string::npos)
-        InsertIdx++;
-    }
-  } else {
-    Prefix = "#";
-  }
-
-  Name.insert(Name.begin() + InsertIdx, Prefix.begin(), Prefix.end());
-  return std::optional<std::string>(Name);
-}
-
 namespace AArch64CC {

 // The CondCodes constants map directly to the 4-bit
encoding of the condition diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test index 4250c77..c583ef7 100644 --- a/llvm/test/tools/llvm-lib/arm64ec-implib.test +++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test @@ -11,9 +11,23 @@ ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll ARMAP-EMPTY: ARMAP-NEXT: Archive EC map +ARMAP-NEXT: #expname in test.dll +ARMAP-NEXT: #funcexp in test.dll +ARMAP-NEXT: #mangledfunc in test.dll +ARMAP-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll +ARMAP-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_expname in test.dll +ARMAP-NEXT: __imp_aux_funcexp in test.dll +ARMAP-NEXT: __imp_aux_mangledfunc in test.dll ARMAP-NEXT: __imp_dataexp in test.dll +ARMAP-NEXT: __imp_expname in test.dll ARMAP-NEXT: __imp_funcexp in test.dll +ARMAP-NEXT: __imp_mangledfunc in test.dll +ARMAP-NEXT: expname in test.dll ARMAP-NEXT: funcexp in test.dll +ARMAP-NEXT: mangledfunc in test.dll RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s @@ -35,10 +49,42 @@ READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC READOBJ-NEXT: Type: code -READOBJ-NEXT: Name type: name +READOBJ-NEXT: Name type: export as READOBJ-NEXT: Export name: funcexp READOBJ-NEXT: Symbol: __imp_funcexp READOBJ-NEXT: Symbol: funcexp +READOBJ-NEXT: Symbol: __imp_aux_funcexp +READOBJ-NEXT: Symbol: #funcexp +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: mangledfunc +READOBJ-NEXT: Symbol: __imp_mangledfunc +READOBJ-NEXT: Symbol: mangledfunc +READOBJ-NEXT: Symbol: __imp_aux_mangledfunc +READOBJ-NEXT: Symbol: #mangledfunc +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_aux_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@$$hYAHPEAX@Z +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expname +READOBJ-NEXT: Symbol: __imp_expname +READOBJ-NEXT: Symbol: expname +READOBJ-NEXT: Symbol: __imp_aux_expname +READOBJ-NEXT: Symbol: #expname READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC @@ -51,8 +97,101 @@ Creating a new lib containing the existing lib: RUN: llvm-lib -machine:arm64ec test.lib -out:test2.lib RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAP %s + +RUN: llvm-lib -machine:arm64ec -def:exportas.def -out:exportas.lib +RUN: llvm-nm --print-armap exportas.lib | FileCheck -check-prefix=EXPAS-ARMAP %s +RUN: llvm-readobj exportas.lib | FileCheck -check-prefix=EXPAS-READOBJ %s + +EXPAS-ARMAP: Archive EC map +EXPAS-ARMAP-NEXT: #func1 in test.dll +EXPAS-ARMAP-NEXT: #func2 in test.dll +EXPAS-ARMAP-NEXT: #func3 in test.dll +EXPAS-ARMAP-NEXT: #func4 in test.dll +EXPAS-ARMAP-NEXT: __imp_aux_func1 in test.dll +EXPAS-ARMAP-NEXT: __imp_aux_func2 in test.dll +EXPAS-ARMAP-NEXT: __imp_aux_func3 in test.dll +EXPAS-ARMAP-NEXT: 
__imp_aux_func4 in test.dll +EXPAS-ARMAP-NEXT: __imp_data1 in test.dll +EXPAS-ARMAP-NEXT: __imp_data2 in test.dll +EXPAS-ARMAP-NEXT: __imp_func1 in test.dll +EXPAS-ARMAP-NEXT: __imp_func2 in test.dll +EXPAS-ARMAP-NEXT: __imp_func3 in test.dll +EXPAS-ARMAP-NEXT: __imp_func4 in test.dll +EXPAS-ARMAP-NEXT: func1 in test.dll +EXPAS-ARMAP-NEXT: func2 in test.dll +EXPAS-ARMAP-NEXT: func3 in test.dll +EXPAS-ARMAP-NEXT: func4 in test.dll + +EXPAS-READOBJ: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: func1 +EXPAS-READOBJ-NEXT: Symbol: __imp_func1 +EXPAS-READOBJ-NEXT: Symbol: func1 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func1 +EXPAS-READOBJ-NEXT: Symbol: #func1 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: func2 +EXPAS-READOBJ-NEXT: Symbol: __imp_func2 +EXPAS-READOBJ-NEXT: Symbol: func2 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func2 +EXPAS-READOBJ-NEXT: Symbol: #func2 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: #func3 +EXPAS-READOBJ-NEXT: Symbol: __imp_func3 +EXPAS-READOBJ-NEXT: Symbol: func3 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func3 +EXPAS-READOBJ-NEXT: Symbol: #func3 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: code +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: #func4 +EXPAS-READOBJ-NEXT: Symbol: __imp_func4 +EXPAS-READOBJ-NEXT: Symbol: func4 +EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func4 +EXPAS-READOBJ-NEXT: Symbol: #func4 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: data +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: #data1 +EXPAS-READOBJ-NEXT: Symbol: __imp_data1 +EXPAS-READOBJ-EMPTY: +EXPAS-READOBJ-NEXT: File: test.dll +EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC +EXPAS-READOBJ-NEXT: Type: data +EXPAS-READOBJ-NEXT: Name type: export as +EXPAS-READOBJ-NEXT: Export name: data2 +EXPAS-READOBJ-NEXT: Symbol: __imp_data2 + + #--- test.def LIBRARY test.dll EXPORTS funcexp + #mangledfunc + ?test_cpp_func@@YAHPEAX@Z + expname=impname dataexp DATA + +#--- exportas.def +LIBRARY test.dll +EXPORTS + #func1 EXPORTAS func1 + func2 EXPORTAS func2 + func3 EXPORTAS #func3 + #func4 EXPORTAS #func4 + data1 DATA EXPORTAS #data1 + #data2 DATA EXPORTAS data2 -- cgit v1.1 From 7d9540ea96ecb1e83f19cc68a202e8fa697c513d Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 10 Feb 2024 14:21:57 +0100 Subject: [libc++][chrono] Implements duration Rep constraints. (#80539) Applies LWG3050 to the constraints of operator*, operator/, and operator%. The changes to the constructor were done in https://reviews.llvm.org/D118902, but that patch did not identify the related LWG-issue, and only adjusted the constructor to the wording in the Standard. 
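For illustration, a minimal self-contained sketch of the scenario (hypothetical user code: the ConstOnlyRep name is invented, modeled on the RepConstConvertibleLWG3050 helper the tests below add, and it assumes a library with this change applied):

```
#include <cassert>
#include <chrono>

// A Rep-like type that is convertible to long only from a const object.
struct ConstOnlyRep {
  operator long() = delete;           // conversion from a non-const object
  operator long() const { return 2; } // conversion from a const object
};

// duration's heterogeneous operators need a common_type specialization.
namespace std {
template <> struct common_type<ConstOnlyRep, long> { using type = long; };
template <> struct common_type<long, ConstOnlyRep> { using type = long; };
} // namespace std

int main() {
  std::chrono::duration<long> d(5);
  ConstOnlyRep r;
  // Pre-LWG3050 the constraints tested is_convertible<Rep2, common_type>,
  // which is false here, so these overloads were SFINAE'd away even though
  // the implementations only ever convert a const object. With the
  // const-qualified constraint the calls are well-formed.
  assert((d * r).count() == 10);
  assert((d / r).count() == 2);
  assert((d % r).count() == 1);
}
```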
Implements: - LWG 3050: Conversion specification problem in chrono::duration constructor --------- Co-authored-by: h-vetinari --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/include/__chrono/duration.h | 8 +++--- libcxx/include/chrono | 2 +- libcxx/test/std/time/rep.h | 23 +++++++++++++++++ .../op_divide_duration.pass.cpp | 15 +++++++++-- .../op_mod_duration.pass.cpp | 15 +++++++++-- .../time.duration.nonmember/op_times_rep.pass.cpp | 30 ++++++++++++++++------ 7 files changed, 77 insertions(+), 18 deletions(-) diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 316127f..f0e9c40 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -192,7 +192,7 @@ "`1203 `__","More useful rvalue stream insertion","Prague","|Complete|","12.0" "`2859 `__","Definition of *reachable* in [ptr.launder] misses pointer arithmetic from pointer-interconvertible object","Prague","","" "`3018 `__","``shared_ptr``\ of function type","Prague","","" -"`3050 `__","Conversion specification problem in ``chrono::duration``\ constructor","Prague","","","|chrono|" +"`3050 `__","Conversion specification problem in ``chrono::duration``\ constructor","Prague","|Complete|","19.0","|chrono|" "`3141 `__","``CopyConstructible``\ doesn't preserve source values","Prague","|Nothing to do|","" "`3150 `__","``UniformRandomBitGenerator``\ should validate ``min``\ and ``max``\ ","Prague","|Complete|","13.0","|ranges|" "`3175 `__","The ``CommonReference``\ requirement of concept ``SwappableWith``\ is not satisfied in the example","Prague","|Complete|","13.0" diff --git a/libcxx/include/__chrono/duration.h b/libcxx/include/__chrono/duration.h index 5693ee6..1e81420 100644 --- a/libcxx/include/__chrono/duration.h +++ b/libcxx/include/__chrono/duration.h @@ -412,7 +412,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR template ::type>::value, int> = 0> + __enable_if_t::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator*(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { typedef typename common_type<_Rep1, _Rep2>::type _Cr; @@ -423,7 +423,7 @@ operator*(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { template ::type>::value, int> = 0> + __enable_if_t::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator*(const _Rep1& __s, const duration<_Rep2, _Period>& __d) { return __d * __s; @@ -435,7 +435,7 @@ template ::value && - is_convertible<_Rep2, typename common_type<_Rep1, _Rep2>::type>::value, + is_convertible::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator/(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { @@ -457,7 +457,7 @@ template ::value && - is_convertible<_Rep2, typename common_type<_Rep1, _Rep2>::type>::value, + is_convertible::type>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> operator%(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { diff --git a/libcxx/include/chrono b/libcxx/include/chrono index c80fa78..f840741 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -58,7 +58,7 @@ public: constexpr explicit duration(const Rep2& r, typename enable_if < - is_convertible::value && + is_convertible::value && (treat_as_floating_point::value || !treat_as_floating_point::value && !treat_as_floating_point::value) >::type* = 0); diff --git a/libcxx/test/std/time/rep.h b/libcxx/test/std/time/rep.h index 
80a0e3c..ddb5c0b 100644 --- a/libcxx/test/std/time/rep.h +++ b/libcxx/test/std/time/rep.h @@ -10,6 +10,7 @@ #define REP_H #include "test_macros.h" +#include class Rep { @@ -29,6 +30,28 @@ public: struct NotARep {}; +#if TEST_STD_VER >= 11 +// Several duration operators take a Rep parameter. Before LWG3050 this +// parameter was constrained to be convertible from a non-const object, +// but the code always uses a const object. So the function was SFINAE'd +// away for this type. LWG3050 fixes the constraint to use a const +// object. +struct RepConstConvertibleLWG3050 { + operator long() = delete; + operator long() const { return 2; } +}; +namespace std { +template <> +struct common_type { + using type = long; +}; +template <> +struct common_type { + using type = long; +}; +} // namespace std +#endif // TEST_STD_VER >= 11 + // std::chrono:::duration has only '*', '/' and '%' taking a "Rep" parameter // Multiplication is commutative, division is not. diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp index d580f4e..6cedd13 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp @@ -21,6 +21,7 @@ #include "test_macros.h" #include "truncate_fp.h" +#include "../../rep.h" int main(int, char**) { @@ -65,7 +66,17 @@ int main(int, char**) constexpr std::chrono::duration > s2(5); static_assert(s1 / s2 == 20./3, ""); } -#endif + { + std::chrono::duration d(5); + RepConstConvertibleLWG3050 x; + + { + auto r = d / x; + assert(r.count() == 2); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); + } + } +#endif // TEST_STD_VER >= 11 - return 0; + return 0; } diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp index 8b8b50d..df637e1 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp @@ -18,6 +18,7 @@ #include #include #include +#include "../../rep.h" #include "test_macros.h" @@ -60,7 +61,17 @@ int main(int, char**) constexpr std::chrono::duration > r = s1 % s2; static_assert(r.count() == 24, ""); } -#endif + { + std::chrono::duration d(5); + RepConstConvertibleLWG3050 x; + + { + auto r = d % x; + assert(r.count() == 1); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); + } + } +#endif // TEST_STD_VER >= 11 - return 0; + return 0; } diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp index c331032..d7c8c2d 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp @@ -26,28 +26,27 @@ #include "test_macros.h" #include "../../rep.h" -int main(int, char**) -{ - { +int main(int, char**) { + { std::chrono::nanoseconds ns(3); ns = ns * 5; assert(ns.count() == 15); ns = 6 * ns; assert(ns.count() == 90); - } + } #if TEST_STD_VER >= 11 - { + { constexpr std::chrono::nanoseconds ns(3); constexpr std::chrono::nanoseconds ns2 = ns * 5; static_assert(ns2.count() == 15, ""); constexpr std::chrono::nanoseconds ns3 = 6 * ns; 
static_assert(ns3.count() == 18, ""); - } + } #endif #if TEST_STD_VER >= 11 - { // This is related to PR#41130 + { // This is related to PR#41130 typedef std::chrono::nanoseconds Duration; Duration d(5); NotARep n; @@ -57,8 +56,23 @@ int main(int, char**) assert(d.count() == 5); d = n * d; assert(d.count() == 5); + } + { + std::chrono::duration d(8); + RepConstConvertibleLWG3050 x; + + { + auto r = d * x; + assert(r.count() == 16); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); } -#endif + { + auto r = x * d; + assert(r.count() == 16); + ASSERT_SAME_TYPE(std::chrono::duration, decltype(r)); + } + } +#endif // TEST_STD_VER >= 11 return 0; } -- cgit v1.1 From 9981f5a72e998e5334852695164731b01bf0307b Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 10 Feb 2024 13:25:53 +0000 Subject: [BasicAA] Add extra onevscale test for multiple dependent geps that lose the NSW flag. NFC --- llvm/test/Analysis/BasicAA/vscale.ll | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll index b2f5c66..895ae1e 100644 --- a/llvm/test/Analysis/BasicAA/vscale.ll +++ b/llvm/test/Analysis/BasicAA/vscale.ll @@ -469,11 +469,29 @@ define void @vscale_negativescale(ptr %p) vscale_range(1,16) { ret void } +; CHECK-LABEL: onevscale +; CHECK-DAG: MustAlias: * %vp161, * %vp162 +; CHECK-DAG: MayAlias: * %vp161, * %vp161b +; CHECK-DAG: MayAlias: * %vp161b, * %vp162 +define void @onevscale(ptr %p) vscale_range(1,16) { + %v1 = call i64 @llvm.vscale.i64() + %vp1 = mul nsw i64 %v1, 16 + %vp2 = mul nsw i64 %v1, 16 + %vp3 = mul nsw i64 %v1, 17 + %vp161 = getelementptr i8, ptr %p, i64 %vp1 + %vp162 = getelementptr i8, ptr %p, i64 %vp2 + %vp161b = getelementptr i8, ptr %vp161, i64 %vp3 + load , ptr %vp161 + load , ptr %vp162 + load , ptr %vp161b + ret void +} + ; CHECK-LABEL: twovscales ; CHECK-DAG: MayAlias: * %vp161, * %vp162 ; CHECK-DAG: MayAlias: * %vp161, * %vp161b ; CHECK-DAG: MayAlias: * %vp161b, * %vp162 -define void @twovscales(ptr %p) { +define void @twovscales(ptr %p) vscale_range(1,16) { %v1 = call i64 @llvm.vscale.i64() %v2 = call i64 @llvm.vscale.i64() %vp1 = mul nsw i64 %v1, 16 -- cgit v1.1 From 59037c0975de51ae29a5f9bd4260131ba3b7c22a Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Sat, 10 Feb 2024 22:18:46 +0800 Subject: [RISCV] Add Zicfiss support to the shadow call stack implementation. (#68075) This patch enable hardware shadow stack with `Zicifss` and `mno-forced-sw-shadow-stack`. New feature forced-sw-shadow-stack disables hardware shadow stack even when `Zicfiss` enabled. --- clang/docs/ShadowCallStack.rst | 42 +++++---- clang/include/clang/Driver/Options.td | 4 + clang/test/Driver/riscv-features.c | 6 ++ llvm/lib/Target/RISCV/RISCVFeatures.td | 5 + llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 14 ++- llvm/test/CodeGen/RISCV/shadowcallstack.ll | 134 +++++++++++++++++++++++++++ 6 files changed, 187 insertions(+), 18 deletions(-) diff --git a/clang/docs/ShadowCallStack.rst b/clang/docs/ShadowCallStack.rst index 6e5192f..d7ece11 100644 --- a/clang/docs/ShadowCallStack.rst +++ b/clang/docs/ShadowCallStack.rst @@ -57,19 +57,25 @@ compiled application or the operating system. Integrating the runtime into the operating system should be preferred since otherwise all thread creation and destruction would need to be intercepted by the application. -The instrumentation makes use of the platform register ``x18`` on AArch64 and -``x3`` (``gp``) on RISC-V. 
For simplicity we will refer to this as the -``SCSReg``. On some platforms, ``SCSReg`` is reserved, and on others, it is -designated as a scratch register. This generally means that any code that may -run on the same thread as code compiled with ShadowCallStack must either target -one of the platforms whose ABI reserves ``SCSReg`` (currently Android, Darwin, -Fuchsia and Windows) or be compiled with a flag to reserve that register (e.g., -``-ffixed-x18``). If absolutely necessary, code compiled without reserving the -register may be run on the same thread as code that uses ShadowCallStack by -saving the register value temporarily on the stack (`example in Android`_) but -this should be done with care since it risks leaking the shadow call stack -address. - +The instrumentation makes use of the platform register ``x18`` on AArch64, +``x3`` (``gp``) on RISC-V with software shadow stack and ``ssp`` on RISC-V with +hardware shadow stack, which needs `Zicfiss`_ and ``-mno-forced-sw-shadow-stack`` +(default option). Note that with ``Zicfiss``_ the RISC-V backend will default to +the hardware based shadow call stack. Users can force the RISC-V backend to +generate the software shadow call stack with ``Zicfiss``_ by passing +``-mforced-sw-shadow-stack``. +For simplicity we will refer to this as the ``SCSReg``. On some platforms, +``SCSReg`` is reserved, and on others, it is designated as a scratch register. +This generally means that any code that may run on the same thread as code +compiled with ShadowCallStack must either target one of the platforms whose ABI +reserves ``SCSReg`` (currently Android, Darwin, Fuchsia and Windows) or be +compiled with a flag to reserve that register (e.g., ``-ffixed-x18``). If +absolutely necessary, code compiled without reserving the register may be run on +the same thread as code that uses ShadowCallStack by saving the register value +temporarily on the stack (`example in Android`_) but this should be done with +care since it risks leaking the shadow call stack address. + +.. _`Zicfiss`: https://github.com/riscv/riscv-cfi/blob/main/cfi_backward.adoc .. _`example in Android`: https://android-review.googlesource.com/c/platform/frameworks/base/+/803717 Because it requires a dedicated register, the ShadowCallStack feature is @@ -151,9 +157,13 @@ Usage To enable ShadowCallStack, just pass the ``-fsanitize=shadow-call-stack`` flag to both compile and link command lines. On aarch64, you also need to pass -``-ffixed-x18`` unless your target already reserves ``x18``. On RISC-V, ``x3`` -(``gp``) is always reserved. It is, however, important to disable GP relaxation -in the linker. This can be done with the ``--no-relax-gp`` flag in GNU ld. +``-ffixed-x18`` unless your target already reserves ``x18``. No additional flags +need to be passed on RISC-V because the software based shadow stack uses +``x3`` (``gp``), which is always reserved, and the hardware based shadow call +stack uses a dedicated register, ``ssp``. +However, it is important to disable GP relaxation in the linker when using the +software based shadow call stack on RISC-V. This can be done with the +``--no-relax-gp`` flag in GNU ld, and is off by default in LLD. 
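For concreteness, a hedged sketch of the driver usage on RISC-V: the exact ``-march`` spelling for the then-experimental extension (``zicfiss0p4`` below) is an assumption, while ``-fsanitize=shadow-call-stack``, ``-menable-experimental-extensions``, and the new ``-m[no-]forced-sw-shadow-stack`` flags are taken from this patch and the existing driver.

```
# Hardware shadow stack: the default once Zicfiss is enabled.
clang --target=riscv64-unknown-elf -menable-experimental-extensions \
      -march=rv64gc_zicfiss0p4 -fsanitize=shadow-call-stack -c f.c

# Force the software (gp-based) scheme even though Zicfiss is available.
clang --target=riscv64-unknown-elf -menable-experimental-extensions \
      -march=rv64gc_zicfiss0p4 -mforced-sw-shadow-stack \
      -fsanitize=shadow-call-stack -c f.c
```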
Low-level API ------------- diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 31503fc..7f00732 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4614,6 +4614,10 @@ def msave_restore : Flag<["-"], "msave-restore">, Group, HelpText<"Enable using library calls for save and restore">; def mno_save_restore : Flag<["-"], "mno-save-restore">, Group, HelpText<"Disable using library calls for save and restore">; +def mforced_sw_shadow_stack : Flag<["-"], "mforced-sw-shadow-stack">, Group, + HelpText<"Force using software shadow stack when shadow-stack enabled">; +def mno_forced_sw_shadow_stack : Flag<["-"], "mno-forced-sw-shadow-stack">, Group, + HelpText<"Not force using software shadow stack when shadow-stack enabled">; } // let Flags = [TargetSpecific] let Flags = [TargetSpecific] in { def menable_experimental_extensions : Flag<["-"], "menable-experimental-extensions">, Group, diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index d3700f7..a108383 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -27,6 +27,12 @@ // DEFAULT-NOT: "-target-feature" "-save-restore" // DEFAULT-NOT: "-target-feature" "+save-restore" +// RUN: %clang --target=riscv32-unknown-elf -### %s -mforced-sw-shadow-stack 2>&1 | FileCheck %s -check-prefix=FORCE-SW-SCS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-forced-sw-shadow-stack 2>&1 | FileCheck %s -check-prefix=NO-FORCE-SW-SCS +// FORCE-SW-SCS: "-target-feature" "+forced-sw-shadow-stack" +// NO-FORCE-SW-SCS: "-target-feature" "-forced-sw-shadow-stack" +// DEFAULT-NOT: "-target-feature" "+forced-sw-shadow-stack" + // RUN: %clang --target=riscv32-unknown-elf -### %s -munaligned-access 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-unaligned-access 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 03e0980..5b8d51f 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1227,3 +1227,8 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true", "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits">; + +def FeatureForcedSWShadowStack : SubtargetFeature< + "forced-sw-shadow-stack", "HasForcedSWShadowStack", "true", + "Implement shadow stack with software.">; +def HasForcedSWShadowStack : Predicate<"Subtarget->hasForcedSWShadowStack()">; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 0de4785..37672dd 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -66,9 +66,14 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; + const RISCVInstrInfo *TII = STI.getInstrInfo(); + if (!STI.hasForcedSWShadowStack() && STI.hasStdExtZicfiss()) { + BuildMI(MBB, MI, DL, TII->get(RISCV::SSPUSH)).addReg(RAReg); + return; + } + Register SCSPReg = RISCVABI::getSCSPReg(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); bool IsRV64 = 
STI.hasFeature(RISCV::Feature64Bit); int64_t SlotSize = STI.getXLen() / 8; // Store return address to shadow call stack @@ -121,9 +126,14 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; + const RISCVInstrInfo *TII = STI.getInstrInfo(); + if (!STI.hasForcedSWShadowStack() && STI.hasStdExtZicfiss()) { + BuildMI(MBB, MI, DL, TII->get(RISCV::SSPOPCHK)).addReg(RAReg); + return; + } + Register SCSPReg = RISCVABI::getSCSPReg(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit); int64_t SlotSize = STI.getXLen() / 8; // Load return address from shadow call stack diff --git a/llvm/test/CodeGen/RISCV/shadowcallstack.ll b/llvm/test/CodeGen/RISCV/shadowcallstack.ll index b41b87a..a320b44 100644 --- a/llvm/test/CodeGen/RISCV/shadowcallstack.ll +++ b/llvm/test/CodeGen/RISCV/shadowcallstack.ll @@ -3,6 +3,14 @@ ; RUN: | FileCheck %s --check-prefix=RV32 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefix=RV64 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfiss < %s \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=RV32-ZICFISS +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfiss < %s \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=RV64-ZICFISS +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfiss,forced-sw-shadow-stack \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfiss,forced-sw-shadow-stack \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64 define void @f1() shadowcallstack { ; RV32-LABEL: f1: @@ -12,6 +20,14 @@ define void @f1() shadowcallstack { ; RV64-LABEL: f1: ; RV64: # %bb.0: ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f1: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f1: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: ret ret void } @@ -25,6 +41,14 @@ define void @f2() shadowcallstack { ; RV64-LABEL: f2: ; RV64: # %bb.0: ; RV64-NEXT: tail foo +; +; RV32-ZICFISS-LABEL: f2: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: tail foo +; +; RV64-ZICFISS-LABEL: f2: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: tail foo tail call void @foo() ret void } @@ -65,6 +89,32 @@ define i32 @f3() shadowcallstack { ; RV64-NEXT: addi gp, gp, -8 ; RV64-NEXT: .cfi_restore gp ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f3: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: sspush ra +; RV32-ZICFISS-NEXT: addi sp, sp, -16 +; RV32-ZICFISS-NEXT: .cfi_def_cfa_offset 16 +; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: .cfi_offset ra, -4 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: addi sp, sp, 16 +; RV32-ZICFISS-NEXT: sspopchk ra +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f3: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: sspush ra +; RV64-ZICFISS-NEXT: addi sp, sp, -16 +; RV64-ZICFISS-NEXT: .cfi_def_cfa_offset 16 +; RV64-ZICFISS-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: .cfi_offset ra, -8 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: addi sp, sp, 16 +; RV64-ZICFISS-NEXT: sspopchk ra +; RV64-ZICFISS-NEXT: ret %res = call i32 @bar() %res1 = add i32 %res, 1 ret i32 %res @@ -140,6 +190,68 @@ define i32 @f4() shadowcallstack { ; RV64-NEXT: addi gp, gp, -8 ; RV64-NEXT: .cfi_restore 
gp ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f4: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: sspush ra +; RV32-ZICFISS-NEXT: addi sp, sp, -16 +; RV32-ZICFISS-NEXT: .cfi_def_cfa_offset 16 +; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: sw s2, 0(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: .cfi_offset ra, -4 +; RV32-ZICFISS-NEXT: .cfi_offset s0, -8 +; RV32-ZICFISS-NEXT: .cfi_offset s1, -12 +; RV32-ZICFISS-NEXT: .cfi_offset s2, -16 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: mv s0, a0 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: mv s1, a0 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: mv s2, a0 +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: add s0, s0, s1 +; RV32-ZICFISS-NEXT: add a0, s2, a0 +; RV32-ZICFISS-NEXT: add a0, s0, a0 +; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: lw s2, 0(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: addi sp, sp, 16 +; RV32-ZICFISS-NEXT: sspopchk ra +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f4: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: sspush ra +; RV64-ZICFISS-NEXT: addi sp, sp, -32 +; RV64-ZICFISS-NEXT: .cfi_def_cfa_offset 32 +; RV64-ZICFISS-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: .cfi_offset ra, -8 +; RV64-ZICFISS-NEXT: .cfi_offset s0, -16 +; RV64-ZICFISS-NEXT: .cfi_offset s1, -24 +; RV64-ZICFISS-NEXT: .cfi_offset s2, -32 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: mv s0, a0 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: mv s1, a0 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: mv s2, a0 +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: add s0, s0, s1 +; RV64-ZICFISS-NEXT: add a0, s2, a0 +; RV64-ZICFISS-NEXT: addw a0, s0, a0 +; RV64-ZICFISS-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: addi sp, sp, 32 +; RV64-ZICFISS-NEXT: sspopchk ra +; RV64-ZICFISS-NEXT: ret %res1 = call i32 @bar() %res2 = call i32 @bar() %res3 = call i32 @bar() @@ -176,6 +288,28 @@ define i32 @f5() shadowcallstack nounwind { ; RV64-NEXT: ld ra, -8(gp) ; RV64-NEXT: addi gp, gp, -8 ; RV64-NEXT: ret +; +; RV32-ZICFISS-LABEL: f5: +; RV32-ZICFISS: # %bb.0: +; RV32-ZICFISS-NEXT: sspush ra +; RV32-ZICFISS-NEXT: addi sp, sp, -16 +; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-ZICFISS-NEXT: call bar +; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-ZICFISS-NEXT: addi sp, sp, 16 +; RV32-ZICFISS-NEXT: sspopchk ra +; RV32-ZICFISS-NEXT: ret +; +; RV64-ZICFISS-LABEL: f5: +; RV64-ZICFISS: # %bb.0: +; RV64-ZICFISS-NEXT: sspush ra +; RV64-ZICFISS-NEXT: addi sp, sp, -16 +; RV64-ZICFISS-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-ZICFISS-NEXT: call bar +; RV64-ZICFISS-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-ZICFISS-NEXT: addi sp, sp, 16 +; RV64-ZICFISS-NEXT: sspopchk ra +; RV64-ZICFISS-NEXT: ret %res = call i32 @bar() %res1 = add i32 %res, 1 ret i32 %res -- cgit v1.1 From 30cd1838dc334775f7a29f57b581f2bdda3f0ea1 
Mon Sep 17 00:00:00 2001 From: Po-yao Chang Date: Sat, 10 Feb 2024 22:22:16 +0800 Subject: [libc++][modules] Fix disabling Unicode (#81294) -DLIBCXX_ENABLE_UNICODE=OFF or -D_LIBCPP_HAS_NO_UNICODE doesn't build without this change. --- libcxx/modules/std/ostream.inc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcxx/modules/std/ostream.inc b/libcxx/modules/std/ostream.inc index 8fcbfb4..0e0e2d54 100644 --- a/libcxx/modules/std/ostream.inc +++ b/libcxx/modules/std/ostream.inc @@ -33,8 +33,10 @@ export namespace std { using std::println; using std::vprint_nonunicode; +# ifndef _LIBCPP_HAS_NO_UNICODE using std::vprint_unicode; -# endif // _LIBCPP_STD_VER >= 23 +# endif // _LIBCPP_HAS_NO_UNICODE +# endif // _LIBCPP_STD_VER >= 23 #endif // _LIBCPP_HAS_NO_LOCALIZATION } // namespace std -- cgit v1.1 From f66f44eb0c194f6bd0b6387d778624b303b6edc1 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 10 Feb 2024 15:25:30 +0100 Subject: [libc++][modules] Regenerates files. After applying the review comments of https://github.com/llvm/llvm-project/pull/80478 I've forgotten to update the generated files. This fixes the issue and removes trailing whitespace. --- libcxx/modules/std.compat.cppm.in | 10 +++++----- libcxx/modules/std.cppm.in | 10 +++++----- libcxx/utils/generate_libcxx_cppm_in.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libcxx/modules/std.compat.cppm.in b/libcxx/modules/std.compat.cppm.in index 1636371..b44dbab 100644 --- a/libcxx/modules/std.compat.cppm.in +++ b/libcxx/modules/std.compat.cppm.in @@ -47,11 +47,11 @@ module; // *** Headers not yet available *** // -// This validation is mainly to aid libc++ developers to add modules for new -// headers. On Windows the Windows SDK can be in the include path. This SDK -// contains the MSVC STL headers. This may give false positives when MSVC STL -// provides a header libc++ has not implemented yet. Therefore this validation -// is not done on Windows. +// This validation is mainly to catch when a new header is added but adding the +// corresponding .inc file is forgotten. However, the check based on __has_include +// alone doesn't work on Windows because the Windows SDK is on the include path, +// and that means the MSVC STL headers can be found as well, tricking __has_include +// into thinking that libc++ provides the header. // #ifndef _WIN32 # if __has_include() diff --git a/libcxx/modules/std.cppm.in b/libcxx/modules/std.cppm.in index 3b59c28..b8d8913 100644 --- a/libcxx/modules/std.cppm.in +++ b/libcxx/modules/std.cppm.in @@ -169,11 +169,11 @@ module; // *** Headers not yet available *** // -// This validation is mainly to aid libc++ developers to add modules for new -// headers. On Windows the Windows SDK can be in the include path. This SDK -// contains the MSVC STL headers. This may give false positives when MSVC STL -// provides a header libc++ has not implemented yet. Therefore this validation -// is not done on Windows. +// This validation is mainly to catch when a new header is added but adding the +// corresponding .inc file is forgotten. However, the check based on __has_include +// alone doesn't work on Windows because the Windows SDK is on the include path, +// and that means the MSVC STL headers can be found as well, tricking __has_include +// into thinking that libc++ provides the header. 
// #ifndef _WIN32 # if __has_include() diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py index 0390ce5..e98ac1b 100644 --- a/libcxx/utils/generate_libcxx_cppm_in.py +++ b/libcxx/utils/generate_libcxx_cppm_in.py @@ -61,7 +61,7 @@ module; """ // *** Headers not yet available *** // -// This validation is mainly to catch when a new header is added but adding the +// This validation is mainly to catch when a new header is added but adding the // corresponding .inc file is forgotten. However, the check based on __has_include // alone doesn't work on Windows because the Windows SDK is on the include path, // and that means the MSVC STL headers can be found as well, tricking __has_include -- cgit v1.1 From a4ac099487d057dde8151700b3802eaeb69cead2 Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Sat, 10 Feb 2024 11:00:00 -0500 Subject: [Flang] Support passing a function that returns procedure pointer as actual corresponding to a procedure dummy. (#80891) Flang crashes on the following case. We missed handling the case where a reference to a function that returns a procedure pointer is passed as the actual argument corresponding to a procedure dummy. This PR fixes that.
```
PROGRAM main
  IMPLICIT NONE
  INTERFACE
    FUNCTION IntF(Arg)
      integer :: Arg, IntF
    END FUNCTION
  END INTERFACE
  INTERFACE
    FUNCTION RetPtr(Arg)
      IMPORT
      PROCEDURE(IntF) :: Arg
      PROCEDURE(IntF), POINTER :: RetPtr
    END FUNCTION
  END INTERFACE
  CALL ModSub(RetPtr(IntF))
contains
  SUBROUTINE ModSub(Fun1)
    PROCEDURE(IntF) :: Fun1
  END SUBROUTINE
END
```
--- flang/lib/Lower/ConvertCall.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index f60cdbb..d8271b1 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -922,7 +922,8 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( // Handle procedure arguments (procedure pointers should go through // prepareProcedurePointerActualArgument). if (hlfir::isFortranProcedureValue(dummyType)) { - // Procedure pointer actual to procedure dummy. + // Procedure pointer or function returns procedure pointer actual to + // procedure dummy. if (actual.isProcedurePointer()) { actual = hlfir::derefPointersAndAllocatables(loc, builder, actual); return PreparedDummyArgument{actual, /*cleanups=*/{}}; @@ -931,7 +932,11 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( assert(actual.isProcedure()); // Do nothing if this is a procedure argument. It is already a // fir.boxproc/fir.tuple as it should. - if (actual.getType() != dummyType) + if (!actual.getType().isa<fir::BoxProcType>() && + actual.getType() != dummyType) + // The actual argument may be a procedure that returns character (a + // fir.tuple) while the dummy is not. Extract the tuple + // in that case. actual = fixProcedureDummyMismatch(loc, builder, actual, dummyType); return PreparedDummyArgument{actual, /*cleanups=*/{}}; } -- cgit v1.1 From 4fb7b3301bfbd439eb3d30d6a36c7cdb26941a0d Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 10 Feb 2024 17:09:53 +0100 Subject: [libc++][print] Moves is_terminal to the dylib. (#80464) Having the check in the header requires including unistd.h on POSIX platforms. This header has other declarations which may conflict with code that uses named declarations provided by this header. For example, code using "int pipe;" would conflict with the function pipe declared in this header.
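A minimal sketch of that clash (hypothetical user code; it needs C++23 for <print> and compiles once this patch stops the header from pulling in unistd.h):

```
#include <print> // before this patch, transitively included <unistd.h> on POSIX

int pipe = 0; // previously ill-formed: redeclares the POSIX function
              // int pipe(int[2]) leaked into this translation unit

int main() { std::println("pipe = {}", pipe); }
```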
Moving the code to the dylib means std::print would not be available on Apple backdeployment targets. On POSIX platforms there is no transcoding required so a not Standard conforming implementation is still a useful and the observable differences are minimal. This behaviour has been done for print before https://github.com/llvm/llvm-project/pull/76293. Note questions have been raised in LWG4044 "Confusing requirements for std::print on POSIX platforms", whether or not the isatty check on POSIX platforms is required. When this LWG issue is resolved the backdeployment targets could become Standard compliant. This patch is intended to be backported to the LLVM-18 branch. Fixes: https://github.com/llvm/llvm-project/issues/79782 --- libcxx/include/print | 14 ++++++------ libcxx/lib/abi/CHANGELOG.TXT | 8 +++++++ ...in.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...21.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...ix.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...ix.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...in.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...21.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...sd.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ...nu.libcxxabi.v1.stable.exceptions.nonew.abilist | 1 + ....libcxxabi.v1.stable.noexceptions.nonew.abilist | 1 + libcxx/src/print.cpp | 25 ++++++++++++++-------- 12 files changed, 40 insertions(+), 16 deletions(-) diff --git a/libcxx/include/print b/libcxx/include/print index 7f2b5ba..543a540 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -32,6 +32,7 @@ namespace std { */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__availability> #include <__concepts/same_as.h> #include <__config> #include <__system_error/system_error.h> @@ -43,10 +44,6 @@ namespace std { #include #include -#if __has_include() -# include -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif @@ -68,7 +65,8 @@ _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream); // Note the function is only implemented on the Windows platform. _LIBCPP_EXPORTED_FROM_ABI void __write_to_windows_console(FILE* __stream, wstring_view __view); # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS - +#elif __has_include() +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream); #endif // _LIBCPP_WIN32API #if _LIBCPP_STD_VER >= 23 @@ -195,15 +193,17 @@ inline constexpr bool __use_unicode_execution_charset = _MSVC_EXECUTION_CHARACTE inline constexpr bool __use_unicode_execution_charset = true; # endif -_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal(FILE* __stream) { +_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal([[maybe_unused]] FILE* __stream) { // The macro _LIBCPP_TESTING_PRINT_IS_TERMINAL is used to change // the behavior in the test. This is not part of the public API. # ifdef _LIBCPP_TESTING_PRINT_IS_TERMINAL return _LIBCPP_TESTING_PRINT_IS_TERMINAL(__stream); +# elif _LIBCPP_AVAILABILITY_HAS_PRINT == 0 + return false; # elif defined(_LIBCPP_WIN32API) return std::__is_windows_terminal(__stream); # elif __has_include() - return isatty(fileno(__stream)); + return std::__is_posix_terminal(__stream); # else # error "Provide a way to determine whether a FILE* is a terminal" # endif diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT index 1179c25..7ff6049 100644 --- a/libcxx/lib/abi/CHANGELOG.TXT +++ b/libcxx/lib/abi/CHANGELOG.TXT @@ -16,6 +16,14 @@ New entries should be added directly below the "Version" header. 
Version 18.0 ------------ +* [libc++] Moves is_terminal to the dylib + + The patch moves the POSIX implementation of is_terminal to the dylib. This is + needed to avoid using in public headers. + + All platforms + Symbol added: _ZNSt6__ndk119__is_posix_terminalEP7__sFILE + * [libc++abi] Implement __cxa_init_primary_exception and use it to optimize std::make_exception_ptr (#65534) This patch implements __cxa_init_primary_exception, an extension to the Itanium C++ ABI. diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index c2fea4d..2064f45 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index a60f099..fec3a45 100644 --- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index a159ff52..e52cf98 100644 --- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': 
'_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 5749a75..52a0470 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index e827114..bced6b2 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index f4077ad..efa2189 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': 
'_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist index e3d3fcb..ebda5b0 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1190,6 +1190,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index 1692330..6432ad3 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1188,6 +1188,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index 2380ffb..1fe84e1 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -1159,6 +1159,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/src/print.cpp 
b/libcxx/src/print.cpp index 3692187..8fa59fd 100644 --- a/libcxx/src/print.cpp +++ b/libcxx/src/print.cpp @@ -8,22 +8,26 @@ #include <__config> -#if defined(_LIBCPP_WIN32API) +#include +#include + +#include <__system_error/system_error.h> -# include -# include +#include "filesystem/error.h" +#if defined(_LIBCPP_WIN32API) # define WIN32_LEAN_AND_MEAN # define NOMINMAX # include # include - -# include <__system_error/system_error.h> - -# include "filesystem/error.h" +#elif __has_include() +# include +#endif _LIBCPP_BEGIN_NAMESPACE_STD +#if defined(_LIBCPP_WIN32API) + _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream) { // Note the Standard does this in one call, but it's unclear whether // an invalid handle is allowed when calling GetConsoleMode. @@ -52,6 +56,9 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst } # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -_LIBCPP_END_NAMESPACE_STD +#elif __has_include() // !_LIBCPP_WIN32API -#endif // !_LIBCPP_WIN32API +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isatty(fileno(__stream)); } +#endif + +_LIBCPP_END_NAMESPACE_STD -- cgit v1.1 From b4c6ab600f2ef6f3a842afee569dcf86bce7a43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 9 Feb 2024 19:03:18 +0100 Subject: [clang][Interp][NFC] Don't use visitLocalInitializer in visitExpr We were unnecessarily getting the pointer of the local variable twice. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index aaa8ac8..6993d75 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2555,10 +2555,13 @@ bool ByteCodeExprGen::visitExpr(const Expr *E) { // For us, that means everything we don't // have a PrimType for. if (std::optional LocalOffset = this->allocateLocal(E)) { - if (!this->visitLocalInitializer(E, *LocalOffset)) + if (!this->emitGetPtrLocal(*LocalOffset, E)) return false; - if (!this->emitGetPtrLocal(*LocalOffset, E)) + if (!visitInitializer(E)) + return false; + + if (!this->emitInitPtr(E)) return false; return this->emitRetValue(E); } -- cgit v1.1 From d2e4a725da5b4cbef8b5c1446f29fed1487aeab0 Mon Sep 17 00:00:00 2001 From: Frederic Cambus Date: Sat, 10 Feb 2024 17:39:30 +0100 Subject: [clang] Update Clang version from 18 to 19 in scan-build.1. Similar to D110763. --- clang/tools/scan-build/man/scan-build.1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/tools/scan-build/man/scan-build.1 b/clang/tools/scan-build/man/scan-build.1 index 29edbca..e2b37f6 100644 --- a/clang/tools/scan-build/man/scan-build.1 +++ b/clang/tools/scan-build/man/scan-build.1 @@ -2,9 +2,9 @@ .\" See https://llvm.org/LICENSE.txt for license information. .\" SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception .\" $Id$ -.Dd Sep 21, 2023 +.Dd Feb 10, 2024 .Dt SCAN-BUILD 1 -.Os "clang" "18" +.Os "clang" "19" .Sh NAME .Nm scan-build .Nd Clang static analyzer -- cgit v1.1 From dce77a357948709e335910ddc07f9c3f2eb2ac4b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 10 Feb 2024 18:11:17 +0000 Subject: [IndVars] Preserve flags of narrow IV inc if replacing with wider inc. (#80446) We are replacing a narrow IV increment with a wider one. If the original (narrow) increment did not wrap, the wider one should not wrap either. 
Set the flags to be the union of both wide increment and original increment; this ensures we preserve flags SCEV could infer for the wider increment. Fixes https://github.com/llvm/llvm-project/issues/71517. --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 21 ++++++ llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll | 4 +- llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll | 2 +- .../Transforms/IndVarSimplify/pr30806-phi-scev.ll | 2 +- .../preserve-nsw-during-expansion.ll | 2 +- .../Transforms/IndVarSimplify/widen-i32-i8ptr.ll | 2 +- llvm/test/Transforms/LoopFlatten/widen-iv.ll | 6 +- llvm/test/Transforms/LoopFlatten/widen-iv2.ll | 4 +- llvm/test/Transforms/LoopFlatten/widen-iv3.ll | 4 +- .../PhaseOrdering/AArch64/indvars-vectorization.ll | 81 ++++++++++++++++++++-- .../PhaseOrdering/AArch64/loopflatten.ll | 2 +- 11 files changed, 111 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 1b142f1..5aa6df4 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -1985,7 +1985,28 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // increment to the new (widened) increment. auto *OrigInc = cast(OrigPhi->getIncomingValueForBlock(LatchBlock)); + WideInc->setDebugLoc(OrigInc->getDebugLoc()); + // We are replacing a narrow IV increment with a wider IV increment. If + // the original (narrow) increment did not wrap, the wider increment one + // should not wrap either. Set the flags to be the union of both wide + // increment and original increment; this ensures we preserve flags SCEV + // could infer for the wider increment. Limit this only to cases where + // both increments directly increment the corresponding PHI nodes and have + // the same opcode. It is not safe to re-use the flags from the original + // increment, if it is more complex and SCEV expansion may have yielded a + // more simplified wider increment. 
+ bool MatchingOps = + match(OrigInc, m_c_BinOp(m_Specific(OrigPhi), m_Value())) && + match(WideInc, m_c_BinOp(m_Specific(WidePhi), m_Value())) && + OrigInc->getOpcode() == WideInc->getOpcode(); + if (MatchingOps && isa(OrigInc) && + isa(WideInc)) { + WideInc->setHasNoUnsignedWrap(WideInc->hasNoUnsignedWrap() || + OrigInc->hasNoUnsignedWrap()); + WideInc->setHasNoSignedWrap(WideInc->hasNoSignedWrap() || + OrigInc->hasNoSignedWrap()); + } } } diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll index 6efe86d..b7d0700 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll @@ -11,7 +11,7 @@ define i32 @fn2() personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[INDVARS1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: invoke void @fn1(i64 [[INDVARS_IV]]) -; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi i32 [ [[INDVARS1]], [[FOR_COND]] ] ; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch] unwind to caller @@ -21,7 +21,7 @@ define i32 @fn2() personality ptr @__CxxFrameHandler3 { ; CHECK: exit: ; CHECK-NEXT: ret i32 [[C_0_LCSSA]] ; CHECK: for.inc: -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll index 8aa698a..7409fc8 100644 --- a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll +++ b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll @@ -148,7 +148,7 @@ define void @guardedloop(ptr %matrix, ptr %vector, ; CHECK-NEXT: [[VECTORP:%.*]] = getelementptr inbounds [0 x double], ptr [[VECTOR:%.*]], i32 0, i64 [[INDVARS_IV2]] ; CHECK-NEXT: [[V2:%.*]] = load double, ptr [[VECTORP]], align 8 ; CHECK-NEXT: call void @use(double [[V2]]) -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP0]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[RETURN_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll b/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll index b45f094..6a2bbfa 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll @@ -43,7 +43,7 @@ define void @foo(ptr %buf, i32 %denominator, ptr %flag) local_unnamed_addr { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_LR_PH]] ] ; CHECK-NEXT: [[BUF_ADDR_07:%.*]] = phi ptr [ [[BUF]], [[WHILE_BODY_LR_PH]] ], [ [[CALL:%.*]], [[WHILE_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[DIV]] to i64 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP2]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @theSize, align 4 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[I]], align 4 ; CHECK-NEXT: call void @bar(ptr nonnull 
[[I]], i64 [[INDVARS_IV_NEXT]]) diff --git a/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll b/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll index 9c2237c..080bc9b 100644 --- a/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll +++ b/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll @@ -23,7 +23,7 @@ define void @test_s172(i32 noundef %xa, i32 noundef %xb, ptr nocapture noundef % ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP1]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 32000 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.end.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll b/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll index 17ce13d..35e6ca6 100644 --- a/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll @@ -15,7 +15,7 @@ define dso_local void @Widen_i32_i8ptr() local_unnamed_addr { ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[GID_0]], i64 1 ; CHECK-NEXT: [[ARRAYIDX2115:%.*]] = getelementptr inbounds [15 x ptr], ptr [[PTRIDS]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: store ptr [[GID_0]], ptr [[ARRAYIDX2115]], align 8 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[FOR_COND2106]] ; entry: diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv.ll b/llvm/test/Transforms/LoopFlatten/widen-iv.ll index 2feca40..ac42acb9 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv.ll @@ -36,7 +36,7 @@ define void @foo(ptr %A, i32 %N, i32 %M) { ; CHECK-NEXT: tail call void @f(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond.cleanup.loopexit: @@ -143,7 +143,7 @@ define void @foo2_sext(ptr nocapture readonly %A, i32 %N, i32 %M) { ; CHECK-NEXT: tail call void @g(i32 [[TMP2]]) ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond1.preheader: @@ -1005,7 +1005,7 @@ define void @foo_M_sext(ptr %A, i32 %N, i16 %M) { ; CHECK-NEXT: tail call void @f(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add 
nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond.cleanup.loopexit: diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll index 946b984..7b1caa7 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll @@ -45,12 +45,12 @@ define dso_local i32 @fn1() local_unnamed_addr #0 { ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP7]] ; CHECK-NEXT: store i32 32, ptr [[ARRAYIDX_US]], align 4 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add nuw nsw i64 [[INDVAR]], 1 ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[J_014_US]], 1 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i64 [[INDVAR_NEXT]], [[TMP1]] ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY3_US]], label [[FOR_COND1_FOR_INC4_CRIT_EDGE_US]] ; CHECK: for.cond1.for.inc4_crit_edge.us: -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1 ; CHECK-NEXT: [[INC5_US]] = add nuw nsw i32 [[I_016_US]], 1 ; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[TMP3]] ; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END6_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll index df8ee6f..6e6c045 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll @@ -25,7 +25,7 @@ define i16 @foo() { ; CHECK-NEXT: ret i16 [[ADD5_LCSSA_LCSSA]] ; CHECK: for.cond.cleanup3: ; CHECK-NEXT: [[ADD5_LCSSA]] = phi i16 [ [[ADD5:%.*]], [[FOR_BODY4]] ] -; CHECK-NEXT: [[INDVAR_NEXT3]] = add i32 [[INDVAR2]], 1 +; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i32 [[INDVAR2]], 1 ; CHECK-NEXT: [[INC7]] = add nuw nsw i16 [[I_013]], 1 ; CHECK-NEXT: [[EXITCOND14_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT3]], 4 ; CHECK-NEXT: br i1 [[EXITCOND14_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -39,7 +39,7 @@ define i16 @foo() { ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i16], ptr @v, i16 0, i16 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ADD5]] = add nsw i16 [[TMP4]], [[SUM_110]] -; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add nuw nsw i32 [[INDVAR]], 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[J_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT]], 16 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll index a7e8e15..af24a9a 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll @@ -14,18 +14,81 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[XA]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SUB]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[XB]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[SMAX7:%.*]] = tail call 
i64 @llvm.smax.i64(i64 [[TMP2]], i64 32000) +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i64 [[TMP2]], 32000 +; CHECK-NEXT: [[UMIN8:%.*]] = zext i1 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP2]], [[UMIN8]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[SMAX7]], [[TMP4]] +; CHECK-NEXT: [[UMAX9:%.*]] = tail call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 [[TMP5]], [[UMAX9]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[UMIN8]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP8]], 23 +; CHECK-NEXT: [[IDENT_CHECK_NOT:%.*]] = icmp eq i32 [[XB]], 1 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[MIN_ITERS_CHECK]], [[IDENT_CHECK_NOT]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[VECTOR_MEMCHECK:%.*]], label [[FOR_BODY_PREHEADER13:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP10]], i64 32000) +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i64 [[TMP10]], 32000 +; CHECK-NEXT: [[UMIN:%.*]] = zext i1 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP10]], [[UMIN]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[SMAX]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[UMIN]] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 4 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP6]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP5]], [[SCEVGEP4]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP8]], -8 +; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[N_VEC]], [[TMP1]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP18]], [[TMP0]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP19]], [[TMP0]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD11]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> 
[[WIDE_LOAD12]], [[WIDE_LOAD10]] +; CHECK-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP22]], align 4, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP23]], align 4, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]] +; CHECK: for.body.preheader13: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER13]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L_A]], [[L_B]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP1]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 32000 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -63,6 +126,14 @@ for.end: !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.mustprogress"} ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[META6]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]], [[META7]]} ;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll index 77f53ad..e514def 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll @@ -21,7 +21,7 @@ define dso_local void @_Z3fooPiii(ptr %A, i32 %N, i32 %M) #0 { ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVAR6]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: tail call void @_Z1fi(i32 [[TMP2]]) -; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw nsw i64 [[INDVAR6]], 1 +; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw i64 [[INDVAR6]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT7]], [[FLATTEN_TRIPCOUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] ; CHECK: for.cond.cleanup: -- cgit v1.1 From ba451c80ba67ab6834305f35d47e36b6b446ce83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 10 Feb 2024 17:28:48 +0100 Subject: [clang][Interp][NFC] Only set result invalid if empty This is currently NFC but required for later changes. A Ret op might fail and set the result to invalid, causing another setInvalid() call, which asserts that the result is still empty. --- clang/lib/AST/Interp/EvalEmitter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index a60f893..945b78d7 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -36,7 +36,7 @@ EvalEmitter::~EvalEmitter() { EvaluationResult EvalEmitter::interpretExpr(const Expr *E) { EvalResult.setSource(E); - if (!this->visitExpr(E)) + if (!this->visitExpr(E) && EvalResult.empty()) EvalResult.setInvalid(); return std::move(this->EvalResult); @@ -45,7 +45,7 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E) { EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD) { EvalResult.setSource(VD); - if (!this->visitDecl(VD)) + if (!this->visitDecl(VD) && EvalResult.empty()) EvalResult.setInvalid(); return std::move(this->EvalResult); -- cgit v1.1 From bc034baaff1f6ce4e18b68c20df3be45bfb5104f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 10 Feb 2024 17:42:36 +0100 Subject: [clang][Interp] Protect InitPtr from non-initializable pointers This can happen when an initializer returns a dummy pointer. 
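In reduced form, the change amounts to guarding the previously unconditional initialize() call. The sketch below uses a hypothetical, simplified Pointer type, not the interpreter's real classes:

    // Minimal sketch of the guard pattern, assuming a Pointer-like type.
    #include <cassert>

    struct Pointer {
      bool IsDummy = false;     // models "points at a dummy descriptor"
      bool Initialized = false;

      bool canBeInitialized() const { return !IsDummy; }
      void initialize() {
        assert(canBeInitialized() && "initializing a dummy pointer");
        Initialized = true;
      }
    };

    bool initPtr(Pointer &Ptr) {
      if (Ptr.canBeInitialized()) // the guard this patch introduces
        Ptr.initialize();
      return true;              // interpretation continues either way
    }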
---
 clang/lib/AST/Interp/Interp.h   |  7 +++++--
 clang/test/AST/Interp/complex.c | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/AST/Interp/complex.c

diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 290edc0..15c1370 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1278,13 +1278,16 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
 
 inline bool InitPtrPop(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
-  Ptr.initialize();
+  if (Ptr.canBeInitialized())
+    Ptr.initialize();
   return true;
 }
 
 inline bool InitPtr(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
-  Ptr.initialize();
+
+  if (Ptr.canBeInitialized())
+    Ptr.initialize();
   return true;
 }
 
diff --git a/clang/test/AST/Interp/complex.c b/clang/test/AST/Interp/complex.c
new file mode 100644
index 0000000..b07d024
--- /dev/null
+++ b/clang/test/AST/Interp/complex.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both -Wno-unused-value %s
+// RUN: %clang_cc1 -verify=ref,both -Wno-unused-value %s
+
+// expected-no-diagnostics
+// ref-no-diagnostics
+
+void blah() {
+  __complex__ unsigned xx;
+  __complex__ signed yy;
+  __complex__ int result;
+
+  /// The following line calls into the constant interpreter.
+  result = xx * yy;
+}
-- 
cgit v1.1

From 0a255fcf4a90f9e864ae9321b28e4956f7c865fb Mon Sep 17 00:00:00 2001
From: David CARLIER
Date: Sat, 10 Feb 2024 19:14:28 +0000
Subject: [compiler-rt][profile] Fix InstrProfilingFile possible resource
 leak. (#81363)

close #79708
---
 compiler-rt/lib/profile/InstrProfilingFile.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 867ae73..f3b457d 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -677,6 +677,7 @@ static void initializeProfileForContinuousMode(void) {
       PROF_ERR("Continuous counter sync mode is enabled, but raw profile is not"
                "page-aligned. CurrentFileOffset = %" PRIu64 ", pagesz = %u.\n",
                (uint64_t)CurrentFileOffset, PageSize);
+      fclose(File);
       return;
     }
     if (writeProfileWithFileObject(Filename, File) != 0) {
@@ -692,6 +693,8 @@ static void initializeProfileForContinuousMode(void) {
 
   if (doMerging()) {
     lprofUnlockFileHandle(File);
+  }
+  if (File != NULL) {
     fclose(File);
   }
 }
-- 
cgit v1.1

From 5e9eaf87b374c3f6638543682b523827834494a8 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Sat, 10 Feb 2024 20:44:14 +0100
Subject: [lldb][libc++] Adds valarray data formatters. (#80609)

The code is heavily based on the vector data formatter.
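The child count falls out of the same begin/end arithmetic the vector formatter uses: the byte distance between libc++'s __begin_ and __end_ members divided by the element size. A self-contained sketch of that computation, with plain integers standing in for the ValueObject machinery:

    #include <cstddef>
    #include <cstdint>

    // Mirrors CalculateNumChildren() in the new formatter below:
    // 0 means "empty or implausible layout".
    static std::size_t numChildren(std::uint64_t begin, std::uint64_t end,
                                   std::size_t elemSize) {
      if (begin == 0 || end == 0 || elemSize == 0)
        return 0;
      if (begin >= end)
        return 0;
      std::uint64_t bytes = end - begin;
      if (bytes % elemSize != 0) // a partial element means a bogus layout
        return 0;
      return static_cast<std::size_t>(bytes / elemSize);
    }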
--- .../Plugins/Language/CPlusPlus/CMakeLists.txt | 1 + .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 9 ++ lldb/source/Plugins/Language/CPlusPlus/LibCxx.h | 4 + .../Plugins/Language/CPlusPlus/LibCxxValarray.cpp | 145 +++++++++++++++++++++ .../data-formatter-stl/libcxx/valarray/Makefile | 5 + .../valarray/TestDataFormatterLibcxxValarray.py | 78 +++++++++++ .../data-formatter-stl/libcxx/valarray/main.cpp | 17 +++ 7 files changed, 259 insertions(+) create mode 100644 lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt index 21108b2..97fa894 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt @@ -17,6 +17,7 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN LibCxxTuple.cpp LibCxxUnorderedMap.cpp LibCxxVariant.cpp + LibCxxValarray.cpp LibCxxVector.cpp LibStdcpp.cpp LibStdcppTuple.cpp diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 1dcda53..675ca38 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -752,6 +752,11 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "^std::__[[:alnum:]]+::vector<.+>$", stl_deref_flags, true); AddCXXSynthetic( cpp_category_sp, + lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator, + "libc++ std::valarray synthetic children", + "^std::__[[:alnum:]]+::valarray<.+>$", stl_deref_flags, true); + AddCXXSynthetic( + cpp_category_sp, lldb_private::formatters::LibcxxStdForwardListSyntheticFrontEndCreator, "libc++ std::forward_list synthetic children", "^std::__[[:alnum:]]+::forward_list<.+>$", stl_synth_flags, true); @@ -871,6 +876,10 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { lldb_private::formatters::LibcxxContainerSummaryProvider, "libc++ std::vector summary provider", "^std::__[[:alnum:]]+::vector<.+>$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, + lldb_private::formatters::LibcxxContainerSummaryProvider, + "libc++ std::valarray summary provider", + "^std::__[[:alnum:]]+::valarray<.+>$", stl_summary_flags, true); AddCXXSummary( cpp_category_sp, lldb_private::formatters::LibcxxContainerSummaryProvider, "libc++ std::list summary provider", diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index cc8e13d..d823fbd 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -220,6 +220,10 @@ LibcxxStdVectorSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); SyntheticChildrenFrontEnd * +LibcxxStdValarraySyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP); + +SyntheticChildrenFrontEnd * LibcxxStdListSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp 
b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp
new file mode 100644
index 0000000..7c8fd25
--- /dev/null
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp
@@ -0,0 +1,145 @@
+//===-- LibCxxValarray.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LibCxx.h"
+
+#include "lldb/Core/ValueObject.h"
+#include "lldb/DataFormatters/FormattersHelpers.h"
+#include <optional>
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::formatters;
+
+namespace lldb_private {
+namespace formatters {
+class LibcxxStdValarraySyntheticFrontEnd : public SyntheticChildrenFrontEnd {
+public:
+  LibcxxStdValarraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp);
+
+  ~LibcxxStdValarraySyntheticFrontEnd() override;
+
+  size_t CalculateNumChildren() override;
+
+  lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
+
+  lldb::ChildCacheState Update() override;
+
+  bool MightHaveChildren() override;
+
+  size_t GetIndexOfChildWithName(ConstString name) override;
+
+private:
+  /// A non-owning pointer to valarray's __begin_ member.
+  ValueObject *m_start = nullptr;
+  /// A non-owning pointer to valarray's __end_ member.
+  ValueObject *m_finish = nullptr;
+  /// The type of valarray's template argument T.
+  CompilerType m_element_type;
+  /// The sizeof valarray's template argument T.
+  uint32_t m_element_size = 0;
+};
+
+} // namespace formatters
+} // namespace lldb_private
+
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    LibcxxStdValarraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
+    : SyntheticChildrenFrontEnd(*valobj_sp), m_element_type() {
+  if (valobj_sp)
+    Update();
+}
+
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    ~LibcxxStdValarraySyntheticFrontEnd() {
+  // these need to stay around because they are child objects who will follow
+  // their parent's life cycle
+  // delete m_start;
+  // delete m_finish;
+}
+
+size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    CalculateNumChildren() {
+  if (!m_start || !m_finish)
+    return 0;
+  uint64_t start_val = m_start->GetValueAsUnsigned(0);
+  uint64_t finish_val = m_finish->GetValueAsUnsigned(0);
+
+  if (start_val == 0 || finish_val == 0)
+    return 0;
+
+  if (start_val >= finish_val)
+    return 0;
+
+  size_t num_children = (finish_val - start_val);
+  if (num_children % m_element_size)
+    return 0;
+  return num_children / m_element_size;
+}
+
+lldb::ValueObjectSP
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::GetChildAtIndex(
+    size_t idx) {
+  if (!m_start || !m_finish)
+    return lldb::ValueObjectSP();
+
+  uint64_t offset = idx * m_element_size;
+  offset = offset + m_start->GetValueAsUnsigned(0);
+  StreamString name;
+  name.Printf("[%" PRIu64 "]", (uint64_t)idx);
+  return CreateValueObjectFromAddress(name.GetString(), offset,
+                                      m_backend.GetExecutionContextRef(),
+                                      m_element_type);
+}
+
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::Update() {
+  m_start = m_finish = nullptr;
+
+  CompilerType type = m_backend.GetCompilerType();
+  if (type.GetNumTemplateArguments() == 0)
+    return ChildCacheState::eRefetch;
+
+  m_element_type = type.GetTypeTemplateArgument(0);
+  if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr))
+    m_element_size = *size;
+
+  if (m_element_size == 0)
+    return ChildCacheState::eRefetch;
+
+  ValueObjectSP start = m_backend.GetChildMemberWithName("__begin_");
+  ValueObjectSP finish = m_backend.GetChildMemberWithName("__end_");
+
+  if (!start || !finish)
+    return ChildCacheState::eRefetch;
+
+  m_start = start.get();
+  m_finish = finish.get();
+
+  return ChildCacheState::eRefetch;
+}
+
+bool lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    MightHaveChildren() {
+  return true;
+}
+
+size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+    GetIndexOfChildWithName(ConstString name) {
+  if (!m_start || !m_finish)
+    return std::numeric_limits<size_t>::max();
+  return ExtractIndexFromString(name.GetCString());
+}
+
+lldb_private::SyntheticChildrenFrontEnd *
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator(
+    CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
+  if (!valobj_sp)
+    return nullptr;
+  return new LibcxxStdValarraySyntheticFrontEnd(valobj_sp);
+}
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile
new file mode 100644
index 0000000..c5df567
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile
@@ -0,0 +1,5 @@
+CXX_SOURCES := main.cpp
+
+USE_LIBCPP := 1
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
new file mode 100644
index 0000000..7b54b34
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
@@ -0,0 +1,78 @@
+"""
+Test lldb data formatter subsystem.
+"""
+
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class LibcxxValarrayDataFormatterTestCase(TestBase):
+    @add_test_categories(["libc++"])
+    def test_with_run_command(self):
+        """Test that valarray contents display correctly."""
+        self.build()
+        (self.target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "break here", lldb.SBFileSpec("main.cpp", False)
+        )
+
+        self.expect(
+            "frame variable va_int",
+            substrs=[
+                "va_int = size=4",
+                "[0] = 0",
+                "[1] = 0",
+                "[2] = 0",
+                "[3] = 0",
+                "}",
+            ],
+        )
+
+        lldbutil.continue_to_breakpoint(process, bkpt)
+        self.expect(
+            "frame variable va_int",
+            substrs=[
+                "va_int = size=4",
+                "[0] = 1",
+                "[1] = 12",
+                "[2] = 123",
+                "[3] = 1234",
+                "}",
+            ],
+        )
+
+        # check access-by-index
+        self.expect("frame variable va_int[0]", substrs=["1"])
+        self.expect("frame variable va_int[1]", substrs=["12"])
+        self.expect("frame variable va_int[2]", substrs=["123"])
+        self.expect("frame variable va_int[3]", substrs=["1234"])
+        self.expect(
+            "frame variable va_int[4]",
+            error=True,
+            substrs=['array index 4 is not valid for "(valarray) va_int"'],
+        )
+
+        self.expect(
+            "frame variable va_double",
+            substrs=[
+                "va_double = size=4",
+                "[0] = 1",
+                "[1] = 0.5",
+                "[2] = 0.25",
+                "[3] = 0.125",
+                "}",
+            ],
+        )
+
+        # check access-by-index
+        self.expect("frame variable va_double[0]", substrs=["1"])
+        self.expect("frame variable va_double[1]", substrs=["0.5"])
+        self.expect("frame variable va_double[2]", substrs=["0.25"])
+        self.expect("frame variable va_double[3]", substrs=["0.125"])
+        self.expect(
+            "frame variable va_double[4]",
+            error=True,
+            substrs=['array index 4 is not valid for "(valarray) va_double"'],
+        )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
new file mode 100644
index 0000000..f32921e
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
@@ -0,0 +1,17 @@
+#include <iostream>
+#include <valarray>
+
+int main() {
+
+  std::valarray<int> va_int(4);
+  std::cout << "break here";
+
+  va_int[0] = 1;
+  va_int[1] = 12;
+  va_int[2] = 123;
+  va_int[3] = 1234;
+
+  std::valarray<double> va_double({1.0, 0.5, 0.25, 0.125});
+
+  std::cout << "break here\n";
+}
-- 
cgit v1.1

From 2a51c56d8e0e410bf896be2c6bebe37344a996e1 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Sat, 10 Feb 2024 19:44:41 +0000
Subject: [gn build] Port 5e9eaf87b374

---
 llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
index 60562ef..6c667b2 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
@@ -37,6 +37,7 @@ static_library("CPlusPlus") {
     "LibCxxSpan.cpp",
     "LibCxxTuple.cpp",
     "LibCxxUnorderedMap.cpp",
+    "LibCxxValarray.cpp",
    "LibCxxVariant.cpp",
     "LibCxxVector.cpp",
     "LibStdcpp.cpp",
-- 
cgit v1.1

From 3a05e7651bc71b3c71757bb406f211645c1c1a37 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov
Date: Sat, 10 Feb 2024 23:35:29 +0300
Subject: [clang][NFC] Annotate `Sema/DeclSpec.h` with `preferred_type`

This helps debuggers to display values in bit-fields in a more helpful
way.
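LLVM_PREFERRED_TYPE(T) expands to Clang's [[clang::preferred_type(T)]] attribute when the host compiler supports it, and to nothing otherwise, so the annotation costs nothing on other compilers. A reduced sketch of the pattern; the macro definition here approximates the one in llvm/Support/Compiler.h:

    #if defined(__has_attribute)
    #if __has_attribute(preferred_type)
    #define LLVM_PREFERRED_TYPE(T) [[clang::preferred_type(T)]]
    #endif
    #endif
    #ifndef LLVM_PREFERRED_TYPE
    #define LLVM_PREFERRED_TYPE(T)
    #endif

    enum class TSC { Unspecified, Imaginary, Complex };

    struct Bits {
      LLVM_PREFERRED_TYPE(TSC)
      unsigned TypeSpecComplex : 2; // debuggers can render this as a TSC value
      LLVM_PREFERRED_TYPE(bool)
      unsigned TypeSpecOwned : 1;   // rendered as true/false instead of 0/1
    };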
--- clang/include/clang/Sema/DeclSpec.h | 56 +++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 77638de..d161147 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -353,36 +353,57 @@ public: private: // storage-class-specifier - /*SCS*/unsigned StorageClassSpec : 3; - /*TSCS*/unsigned ThreadStorageClassSpec : 2; + LLVM_PREFERRED_TYPE(SCS) + unsigned StorageClassSpec : 3; + LLVM_PREFERRED_TYPE(TSCS) + unsigned ThreadStorageClassSpec : 2; + LLVM_PREFERRED_TYPE(bool) unsigned SCS_extern_in_linkage_spec : 1; // type-specifier - /*TypeSpecifierWidth*/ unsigned TypeSpecWidth : 2; - /*TSC*/unsigned TypeSpecComplex : 2; - /*TSS*/unsigned TypeSpecSign : 2; - /*TST*/unsigned TypeSpecType : 7; + LLVM_PREFERRED_TYPE(TypeSpecifierWidth) + unsigned TypeSpecWidth : 2; + LLVM_PREFERRED_TYPE(TSC) + unsigned TypeSpecComplex : 2; + LLVM_PREFERRED_TYPE(TypeSpecifierSign) + unsigned TypeSpecSign : 2; + LLVM_PREFERRED_TYPE(TST) + unsigned TypeSpecType : 7; + LLVM_PREFERRED_TYPE(bool) unsigned TypeAltiVecVector : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeAltiVecPixel : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeAltiVecBool : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeSpecOwned : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeSpecPipe : 1; + LLVM_PREFERRED_TYPE(bool) unsigned TypeSpecSat : 1; + LLVM_PREFERRED_TYPE(bool) unsigned ConstrainedAuto : 1; // type-qualifiers + LLVM_PREFERRED_TYPE(TQ) unsigned TypeQualifiers : 5; // Bitwise OR of TQ. // function-specifier + LLVM_PREFERRED_TYPE(bool) unsigned FS_inline_specified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FS_forceinline_specified: 1; + LLVM_PREFERRED_TYPE(bool) unsigned FS_virtual_specified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FS_noreturn_specified : 1; // friend-specifier + LLVM_PREFERRED_TYPE(bool) unsigned Friend_specified : 1; // constexpr-specifier + LLVM_PREFERRED_TYPE(ConstexprSpecKind) unsigned ConstexprSpecifier : 2; union { @@ -1246,6 +1267,7 @@ struct DeclaratorChunk { struct PointerTypeInfo { /// The type qualifiers: const/volatile/restrict/unaligned/atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; /// The location of the const-qualifier, if any. @@ -1279,12 +1301,15 @@ struct DeclaratorChunk { struct ArrayTypeInfo { /// The type qualifiers for the array: /// const/volatile/restrict/__unaligned/_Atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; /// True if this dimension included the 'static' keyword. + LLVM_PREFERRED_TYPE(bool) unsigned hasStatic : 1; /// True if this dimension was [*]. In this case, NumElts is null. + LLVM_PREFERRED_TYPE(bool) unsigned isStar : 1; /// This is the size of the array, or null if [] or [*] was specified. @@ -1331,28 +1356,35 @@ struct DeclaratorChunk { /// hasPrototype - This is true if the function had at least one typed /// parameter. If the function is () or (a,b,c), then it has no prototype, /// and is treated as a K&R-style function. + LLVM_PREFERRED_TYPE(bool) unsigned hasPrototype : 1; /// isVariadic - If this function has a prototype, and if that /// proto ends with ',...)', this is true. When true, EllipsisLoc /// contains the location of the ellipsis. + LLVM_PREFERRED_TYPE(bool) unsigned isVariadic : 1; /// Can this declaration be a constructor-style initializer? + LLVM_PREFERRED_TYPE(bool) unsigned isAmbiguous : 1; /// Whether the ref-qualifier (if any) is an lvalue reference. 
/// Otherwise, it's an rvalue reference. + LLVM_PREFERRED_TYPE(bool) unsigned RefQualifierIsLValueRef : 1; /// ExceptionSpecType - An ExceptionSpecificationType value. + LLVM_PREFERRED_TYPE(ExceptionSpecificationType) unsigned ExceptionSpecType : 4; /// DeleteParams - If this is true, we need to delete[] Params. + LLVM_PREFERRED_TYPE(bool) unsigned DeleteParams : 1; /// HasTrailingReturnType - If this is true, a trailing return type was /// specified. + LLVM_PREFERRED_TYPE(bool) unsigned HasTrailingReturnType : 1; /// The location of the left parenthesis in the source. @@ -1567,6 +1599,7 @@ struct DeclaratorChunk { struct BlockPointerTypeInfo { /// For now, sema will catch these as invalid. /// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; void destroy() { @@ -1575,6 +1608,7 @@ struct DeclaratorChunk { struct MemberPointerTypeInfo { /// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic. + LLVM_PREFERRED_TYPE(DeclSpec::TQ) unsigned TypeQuals : 5; /// Location of the '*' token. SourceLocation StarLoc; @@ -1767,6 +1801,7 @@ private: /// The bindings. Binding *Bindings; unsigned NumBindings : 31; + LLVM_PREFERRED_TYPE(bool) unsigned DeleteBindings : 1; friend class Declarator; @@ -1883,33 +1918,42 @@ private: SmallVector DeclTypeInfo; /// InvalidType - Set by Sema::GetTypeForDeclarator(). + LLVM_PREFERRED_TYPE(bool) unsigned InvalidType : 1; /// GroupingParens - Set by Parser::ParseParenDeclarator(). + LLVM_PREFERRED_TYPE(bool) unsigned GroupingParens : 1; /// FunctionDefinition - Is this Declarator for a function or member /// definition and, if so, what kind? /// /// Actually a FunctionDefinitionKind. + LLVM_PREFERRED_TYPE(FunctionDefinitionKind) unsigned FunctionDefinition : 2; /// Is this Declarator a redeclaration? + LLVM_PREFERRED_TYPE(bool) unsigned Redeclaration : 1; /// true if the declaration is preceded by \c __extension__. + LLVM_PREFERRED_TYPE(bool) unsigned Extension : 1; /// Indicates whether this is an Objective-C instance variable. + LLVM_PREFERRED_TYPE(bool) unsigned ObjCIvar : 1; /// Indicates whether this is an Objective-C 'weak' property. + LLVM_PREFERRED_TYPE(bool) unsigned ObjCWeakProperty : 1; /// Indicates whether the InlineParams / InlineBindings storage has been used. + LLVM_PREFERRED_TYPE(bool) unsigned InlineStorageUsed : 1; /// Indicates whether this declarator has an initializer. + LLVM_PREFERRED_TYPE(bool) unsigned HasInitializer : 1; /// Attributes attached to the declarator. -- cgit v1.1 From 76e3759d8d2dc5af755737a764b237ff04aaf7f4 Mon Sep 17 00:00:00 2001 From: Ikhlas Ajbar Date: Sat, 10 Feb 2024 14:42:50 -0600 Subject: [Hexagon] Order objects on the stack by their alignments (#81280) This patch sorts stack objects by their alignment value from the largest to the smallest. If two objects have the same alignment, then they are sorted by their size from the largest to the smallest. This minimizes padding and reduces run time stack size. 
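The policy is a stable sort keyed on (alignment, size), both descending: placing the most strictly aligned objects first means later objects usually land on suitable offsets without extra padding. A standalone sketch of the ordering (SlotInfo is a hypothetical record; the real pass additionally skips invalid and variable-sized objects):

    #include <algorithm>
    #include <tuple>
    #include <vector>

    struct SlotInfo {
      unsigned Index;     // frame index of the stack object
      unsigned Size;      // object size in bytes
      unsigned Alignment; // required alignment in bytes
    };

    // Largest alignment first; ties broken by largest size first.
    static void orderSlots(std::vector<SlotInfo> &Slots) {
      std::stable_sort(Slots.begin(), Slots.end(),
                       [](const SlotInfo &A, const SlotInfo &B) {
                         return std::make_tuple(A.Alignment, A.Size) >
                                std::make_tuple(B.Alignment, B.Size);
                       });
    }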
---
 llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp   | 64 ++++++++++++++++++++++
 llvm/lib/Target/Hexagon/HexagonFrameLowering.h     |  4 ++
 llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll     | 16 +++---
 llvm/test/CodeGen/Hexagon/order-stack-object.ll    | 42 ++++++++++++++
 .../test/CodeGen/Hexagon/store-imm-stack-object.ll | 12 ++--
 .../hexagon_generated_funcs.ll.generated.expected  | 34 ++++++------
 ...hexagon_generated_funcs.ll.nogenerated.expected | 34 ++++++------
 7 files changed, 158 insertions(+), 48 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/order-stack-object.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 812e5f7..2326511 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -2688,3 +2688,67 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const {
 
   return false;
 }
+
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+struct HexagonFrameSortingObject {
+  bool IsValid = false;
+  unsigned Index = 0; // Index of Object into MFI list.
+  unsigned Size = 0;
+  Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
+};
+
+struct HexagonFrameSortingComparator {
+  inline bool operator()(const HexagonFrameSortingObject &A,
+                         const HexagonFrameSortingObject &B) const {
+    return std::make_tuple(!A.IsValid, A.ObjectAlignment, A.Size) <
+           std::make_tuple(!B.IsValid, B.ObjectAlignment, B.Size);
+  }
+};
+} // namespace
+
+// Sort objects on the stack by alignment value and then by size to minimize
+// padding.
+void HexagonFrameLowering::orderFrameObjects(
+    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+
+  if (ObjectsToAllocate.empty())
+    return;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  int NObjects = ObjectsToAllocate.size();
+
+  // Create an array of all MFI objects.
+  SmallVector<HexagonFrameSortingObject> SortingObjects(
+      MFI.getObjectIndexEnd());
+
+  for (int i = 0, j = 0, e = MFI.getObjectIndexEnd(); i < e && j != NObjects;
+       ++i) {
+    if (i != ObjectsToAllocate[j])
+      continue;
+    j++;
+
+    // A variable size object has size equal to 0. Since Hexagon sets
+    // getUseLocalStackAllocationBlock() to true, a local block is allocated
+    // earlier. This case is not handled here for now.
+    int Size = MFI.getObjectSize(i);
+    if (Size == 0)
+      return;
+
+    SortingObjects[i].IsValid = true;
+    SortingObjects[i].Index = i;
+    SortingObjects[i].Size = Size;
+    SortingObjects[i].ObjectAlignment = MFI.getObjectAlign(i);
+  }
+
+  // Sort objects by alignment and then by size.
+  llvm::stable_sort(SortingObjects, HexagonFrameSortingComparator());
+
+  // Modify the original list to represent the final order.
+  int i = NObjects;
+  for (auto &Obj : SortingObjects) {
+    if (i == 0)
+      break;
+    ObjectsToAllocate[--i] = Obj.Index;
+  }
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
index b2222f0..98e69dc 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -35,6 +35,10 @@ public:
   explicit HexagonFrameLowering()
       : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align(1), true) {}
 
+  void
+  orderFrameObjects(const MachineFunction &MF,
+                    SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
   // All of the prolog/epilog functionality, including saving and restoring
   // callee-saved registers is handled in emitPrologue. This is to have the
   // logic for shrink-wrapping in one place.
diff --git a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll index 6000b9b..9ca1b17 100644 --- a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll @@ -42,7 +42,7 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: } // 8-byte Folded Spill ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vsplat(r16) -; CHECK-NEXT: vmem(r29+#6) = v0.new +; CHECK-NEXT: vmem(r29+#2) = v0.new ; CHECK-NEXT: } // 128-byte Folded Spill ; CHECK-NEXT: { ; CHECK-NEXT: q0 = vand(v0,r0) @@ -56,7 +56,7 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vand(q0,r17) ; CHECK-NEXT: r19 = ##g0+128 -; CHECK-NEXT: vmem(r29+#7) = v0.new +; CHECK-NEXT: vmem(r29+#1) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r20 = ##g0 @@ -78,15 +78,15 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: vmem(r20+#0) = v30 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmem(r29+#6) +; CHECK-NEXT: v0 = vmem(r29+#2) ; CHECK-NEXT: } // 128-byte Folded Reload ; CHECK-NEXT: { ; CHECK-NEXT: v1:0.h = vadd(v0.ub,v0.ub) ; CHECK-NEXT: r0 = ##g2 -; CHECK-NEXT: vmem(r29+#1) = v0.new +; CHECK-NEXT: vmem(r29+#6) = v0.new ; CHECK-NEXT: } // 256-byte Folded Spill ; CHECK-NEXT: { -; CHECK-NEXT: vmem(r29+#2) = v1 +; CHECK-NEXT: vmem(r29+#7) = v1 ; CHECK-NEXT: } // 256-byte Folded Spill ; CHECK-NEXT: { ; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r17.ub,#0) @@ -98,10 +98,10 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = ##2147483647 -; CHECK-NEXT: v0 = vmem(r29+#1) +; CHECK-NEXT: v0 = vmem(r29+#6) ; CHECK-NEXT: } // 256-byte Folded Reload ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmem(r29+#2) +; CHECK-NEXT: v1 = vmem(r29+#7) ; CHECK-NEXT: } // 256-byte Folded Reload ; CHECK-NEXT: { ; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r0.ub,#1) @@ -142,7 +142,7 @@ define dso_local void @f2() #0 { ; CHECK-NEXT: vmem(r20+#0) = v0 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmem(r29+#6) +; CHECK-NEXT: v0 = vmem(r29+#2) ; CHECK-NEXT: } // 128-byte Folded Reload ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vmem(r29+#3) diff --git a/llvm/test/CodeGen/Hexagon/order-stack-object.ll b/llvm/test/CodeGen/Hexagon/order-stack-object.ll new file mode 100644 index 0000000..bdc16e9 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/order-stack-object.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=hexagon -mattr=+hvxv68,+hvx-length128b < %s | FileCheck %s + +; Check that ordering objects on the stack from the largest to the smallest has +; decreased the space allocated on the stack by 512 Bytes. 
+ +; CHECK: allocframe(r29,#2432):raw + +define void @test(ptr nocapture readonly %arg, ptr nocapture writeonly %arg1, i32 %arg2) local_unnamed_addr { +bb: + %shl = shl i32 %arg2, 5 + br label %bb3 + +bb3: + %phi = phi i32 [ 0, %bb ], [ %add13, %bb3 ] + %add = add i32 %phi, %shl + %sext = sext i32 %add to i64 + %getelementptr = getelementptr float, ptr %arg, i64 %sext + %load = load <32 x float>, ptr %getelementptr, align 4 + %fmul = fmul <32 x float> %load, + %fmul4 = fmul <32 x float> %load, + %fmul5 = fmul <32 x float> %load, + %fmul6 = fmul <32 x float> %load, %fmul5 + %fmul7 = fmul <32 x float> %load, %fmul6 + %fadd = fadd <32 x float> %fmul4, %fmul7 + %fmul8 = fmul <32 x float> %fadd, + %call = tail call <32 x float> @llvm.exp.v32f32(<32 x float> %fmul8) + %fsub = fsub <32 x float> , %call + %fadd9 = fadd <32 x float> %call, + %fdiv = fdiv <32 x float> %fsub, %fadd9 + %fadd10 = fadd <32 x float> %fdiv, + %fmul11 = fmul <32 x float> %fmul, %fadd10 + %getelementptr12 = getelementptr float, ptr %arg1, i64 %sext + store <32 x float> %fmul11, ptr %getelementptr12, align 128 + %add13 = add nuw nsw i32 %phi, 128 + %icmp = icmp ult i32 %phi, 8064 + br i1 %icmp, label %bb3, label %bb14 + +bb14: + ret void +} + +declare <32 x float> @llvm.exp.v32f32(<32 x float>) diff --git a/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll b/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll index 8c5b11d..bb9f7cf 100644 --- a/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll +++ b/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll @@ -3,10 +3,10 @@ target triple = "hexagon" ; CHECK-LABEL: test1: -; CHECK-DAG: memw(r29+#4) = ##875770417 +; CHECK-DAG: memw(r29+#12) = ##875770417 ; CHECK-DAG: memw(r29+#8) = #51 -; CHECK-DAG: memh(r29+#12) = #50 -; CHECK-DAG: memb(r29+#15) = #49 +; CHECK-DAG: memh(r29+#6) = #50 +; CHECK-DAG: memb(r29+#5) = #49 define void @test1() { b0: %v1 = alloca [1 x i8], align 1 @@ -30,9 +30,9 @@ b0: } ; CHECK-LABEL: test2: -; CHECK-DAG: memw(r29+#208) = #51 -; CHECK-DAG: memh(r29+#212) = r{{[0-9]+}} -; CHECK-DAG: memb(r29+#215) = r{{[0-9]+}} +; CHECK-DAG: memw(r29+#8) = #51 +; CHECK-DAG: memh(r29+#6) = r{{[0-9]+}} +; CHECK-DAG: memb(r29+#5) = r{{[0-9]+}} define void @test2() { b0: %v1 = alloca [1 x i8], align 1 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected index 2ab769f..cd135ce 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected @@ -75,31 +75,31 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #0 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = memw(r29+#16) +; CHECK-NEXT: r1 = memw(r29+#8) ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = cmp.eq(r1,#0) -; CHECK-NEXT: if (p0.new) 
memw(r29+#8) = #3 +; CHECK-NEXT: if (p0.new) memw(r29+#16) = #3 ; CHECK-NEXT: if (p0.new) memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) memw(r29+#4) = #4 -; CHECK-NEXT: if (p0) memw(r29+#16) = #1 +; CHECK-NEXT: if (p0) memw(r29+#20) = #4 +; CHECK-NEXT: if (p0) memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (!p0) memw(r29+#8) = #1 +; CHECK-NEXT: if (!p0) memw(r29+#16) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw @@ -116,27 +116,27 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: memw(r0+#0) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: //# InlineAsm Start ; CHECK-NEXT: //# InlineAsm End ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected index 52dd5f1..833bf68 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected @@ -16,31 +16,31 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #0 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = memw(r29+#16) +; CHECK-NEXT: r1 = memw(r29+#8) ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = cmp.eq(r1,#0) -; CHECK-NEXT: if (p0.new) memw(r29+#8) = #3 +; CHECK-NEXT: if (p0.new) memw(r29+#16) = #3 ; CHECK-NEXT: if (p0.new) memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) memw(r29+#4) = #4 -; CHECK-NEXT: if (p0) memw(r29+#16) = #1 +; CHECK-NEXT: if (p0) memw(r29+#20) = #4 +; CHECK-NEXT: if (p0) memw(r29+#8) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (!p0) memw(r29+#8) = #1 +; CHECK-NEXT: if (!p0) memw(r29+#16) = #1 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw @@ -93,27 +93,27 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: .cfi_offset r31, -4 ; CHECK-NEXT: .cfi_offset r30, -8 ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#20) = #0 +; CHECK-NEXT: memw(r29+#4) = #0 ; CHECK-NEXT: memw(r0+#0) = 
#1 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: //# InlineAsm Start ; CHECK-NEXT: //# InlineAsm End ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #0 -; CHECK-NEXT: memw(r29+#16) = #1 +; CHECK-NEXT: memw(r29+#8) = #1 ; CHECK-NEXT: memw(r29+#12) = #2 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw(r29+#8) = #3 -; CHECK-NEXT: memw(r29+#4) = #4 +; CHECK-NEXT: memw(r29+#16) = #3 +; CHECK-NEXT: memw(r29+#20) = #4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r31:30 = dealloc_return(r30):raw -- cgit v1.1 From 4e16a75902d5718f4932fae9b2a07c410cd0ba34 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sat, 10 Feb 2024 23:58:26 +0300 Subject: [clang][NFC] Annotate `Sema/ScopeInfo.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/ScopeInfo.h | 37 ++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 6eaa7438..076dcaa 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -97,6 +97,8 @@ public: : PD(PD), Loc(Loc), Stmts(Stmts) {} }; +enum class FirstCoroutineStmtKind { co_return, co_await, co_yield }; + /// Retains information about a function, method, or block that is /// currently being parsed. class FunctionScopeInfo { @@ -170,6 +172,7 @@ public: /// An enumeration representing the kind of the first coroutine statement /// in the function. One of co_return, co_await, or co_yield. + LLVM_PREFERRED_TYPE(FirstCoroutineStmtKind) unsigned char FirstCoroutineStmtKind : 2; /// Whether we found an immediate-escalating expression. @@ -502,22 +505,30 @@ public: assert(FirstCoroutineStmtLoc.isInvalid() && "first coroutine statement location already set"); FirstCoroutineStmtLoc = Loc; - FirstCoroutineStmtKind = llvm::StringSwitch(Keyword) - .Case("co_return", 0) - .Case("co_await", 1) - .Case("co_yield", 2); + FirstCoroutineStmtKind = + llvm::StringSwitch(Keyword) + .Case("co_return", + llvm::to_underlying(FirstCoroutineStmtKind::co_return)) + .Case("co_await", + llvm::to_underlying(FirstCoroutineStmtKind::co_await)) + .Case("co_yield", + llvm::to_underlying(FirstCoroutineStmtKind::co_yield)); } StringRef getFirstCoroutineStmtKeyword() const { assert(FirstCoroutineStmtLoc.isValid() && "no coroutine statement available"); - switch (FirstCoroutineStmtKind) { - case 0: return "co_return"; - case 1: return "co_await"; - case 2: return "co_yield"; - default: - llvm_unreachable("FirstCoroutineStmtKind has an invalid value"); + auto Value = + static_cast(FirstCoroutineStmtKind); + switch (Value) { + case FirstCoroutineStmtKind::co_return: + return "co_return"; + case FirstCoroutineStmtKind::co_await: + return "co_await"; + case FirstCoroutineStmtKind::co_yield: + return "co_yield"; }; + llvm_unreachable("FirstCoroutineStmtKind has an invalid value"); } void setNeedsCoroutineSuspends(bool value = true) { @@ -582,25 +593,31 @@ class Capture { QualType CaptureType; /// The CaptureKind of this capture. + LLVM_PREFERRED_TYPE(CaptureKind) unsigned Kind : 2; /// Whether this is a nested capture (a capture of an enclosing capturing /// scope's capture). 
+ LLVM_PREFERRED_TYPE(bool) unsigned Nested : 1; /// Whether this is a capture of '*this'. + LLVM_PREFERRED_TYPE(bool) unsigned CapturesThis : 1; /// Whether an explicit capture has been odr-used in the body of the /// lambda. + LLVM_PREFERRED_TYPE(bool) unsigned ODRUsed : 1; /// Whether an explicit capture has been non-odr-used in the body of /// the lambda. + LLVM_PREFERRED_TYPE(bool) unsigned NonODRUsed : 1; /// Whether the capture is invalid (a capture was required but the entity is /// non-capturable). + LLVM_PREFERRED_TYPE(bool) unsigned Invalid : 1; public: -- cgit v1.1 From d2812d2d1a9b4edb64e95a9a86a2599a24bcb5ec Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 00:11:13 +0300 Subject: [clang][NFC] Annotate `Sema/Overload.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/Overload.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 6ccabad..9b342c0 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -278,40 +278,50 @@ class Sema; /// Whether this is the deprecated conversion of a /// string literal to a pointer to non-const character data /// (C++ 4.2p2). + LLVM_PREFERRED_TYPE(bool) unsigned DeprecatedStringLiteralToCharPtr : 1; /// Whether the qualification conversion involves a change in the /// Objective-C lifetime (for automatic reference counting). + LLVM_PREFERRED_TYPE(bool) unsigned QualificationIncludesObjCLifetime : 1; /// IncompatibleObjC - Whether this is an Objective-C conversion /// that we should warn about (if we actually use it). + LLVM_PREFERRED_TYPE(bool) unsigned IncompatibleObjC : 1; /// ReferenceBinding - True when this is a reference binding /// (C++ [over.ics.ref]). + LLVM_PREFERRED_TYPE(bool) unsigned ReferenceBinding : 1; /// DirectBinding - True when this is a reference binding that is a /// direct binding (C++ [dcl.init.ref]). + LLVM_PREFERRED_TYPE(bool) unsigned DirectBinding : 1; /// Whether this is an lvalue reference binding (otherwise, it's /// an rvalue reference binding). + LLVM_PREFERRED_TYPE(bool) unsigned IsLvalueReference : 1; /// Whether we're binding to a function lvalue. + LLVM_PREFERRED_TYPE(bool) unsigned BindsToFunctionLvalue : 1; /// Whether we're binding to an rvalue. + LLVM_PREFERRED_TYPE(bool) unsigned BindsToRvalue : 1; /// Whether this binds an implicit object argument to a /// non-static member function without a ref-qualifier. + LLVM_PREFERRED_TYPE(bool) unsigned BindsImplicitObjectArgumentWithoutRefQualifier : 1; /// Whether this binds a reference to an object with a different /// Objective-C lifetime qualifier. + LLVM_PREFERRED_TYPE(bool) unsigned ObjCLifetimeConversionBinding : 1; /// FromType - The type that this conversion is converting @@ -541,9 +551,11 @@ class Sema; }; /// ConversionKind - The kind of implicit conversion sequence. + LLVM_PREFERRED_TYPE(Kind) unsigned ConversionKind : 31; // Whether the initializer list was of an incomplete array. + LLVM_PREFERRED_TYPE(bool) unsigned InitializerListOfIncompleteArray : 1; /// When initializing an array or std::initializer_list from an @@ -878,6 +890,7 @@ class Sema; CallExpr::ADLCallKind IsADLCandidate : 1; /// Whether this is a rewritten candidate, and if so, of what kind? + LLVM_PREFERRED_TYPE(OverloadCandidateRewriteKind) unsigned RewriteKind : 2; /// FailureKind - The reason why this candidate is not viable. 
-- cgit v1.1 From 425fd3eb10f29e73d722b4c2bc9cb50798de18e8 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 00:15:25 +0300 Subject: [clang][NFC] Rename FirstCoroutineStmtKind enumerators So that they do not use coroutine keywords. Fixed buildbot failure https://lab.llvm.org/buildbot/#/builders/86/builds/74100 --- clang/include/clang/Sema/ScopeInfo.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 076dcaa..ca3d0a0 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -97,7 +97,7 @@ public: : PD(PD), Loc(Loc), Stmts(Stmts) {} }; -enum class FirstCoroutineStmtKind { co_return, co_await, co_yield }; +enum class FirstCoroutineStmtKind { CoReturn, CoAwait, CoYield }; /// Retains information about a function, method, or block that is /// currently being parsed. @@ -508,11 +508,11 @@ public: FirstCoroutineStmtKind = llvm::StringSwitch(Keyword) .Case("co_return", - llvm::to_underlying(FirstCoroutineStmtKind::co_return)) + llvm::to_underlying(FirstCoroutineStmtKind::CoReturn)) .Case("co_await", - llvm::to_underlying(FirstCoroutineStmtKind::co_await)) + llvm::to_underlying(FirstCoroutineStmtKind::CoAwait)) .Case("co_yield", - llvm::to_underlying(FirstCoroutineStmtKind::co_yield)); + llvm::to_underlying(FirstCoroutineStmtKind::CoYield)); } StringRef getFirstCoroutineStmtKeyword() const { @@ -521,11 +521,11 @@ public: auto Value = static_cast(FirstCoroutineStmtKind); switch (Value) { - case FirstCoroutineStmtKind::co_return: + case FirstCoroutineStmtKind::CoReturn: return "co_return"; - case FirstCoroutineStmtKind::co_await: + case FirstCoroutineStmtKind::CoAwait: return "co_await"; - case FirstCoroutineStmtKind::co_yield: + case FirstCoroutineStmtKind::CoYield: return "co_yield"; }; llvm_unreachable("FirstCoroutineStmtKind has an invalid value"); -- cgit v1.1 From 6a7cf806a66c67df01818fda01116a2dd2d90b0d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 00:21:37 +0300 Subject: [clang][NFC] Annotate `Sema/ParsedAttr.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/ParsedAttr.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 8c0edca..8c3ba39 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -82,7 +82,9 @@ struct AvailabilityData { struct TypeTagForDatatypeData { ParsedType MatchingCType; + LLVM_PREFERRED_TYPE(bool) unsigned LayoutCompatible : 1; + LLVM_PREFERRED_TYPE(bool) unsigned MustBeNull : 1; }; struct PropertyData { @@ -149,33 +151,41 @@ private: unsigned NumArgs : 16; /// True if already diagnosed as invalid. + LLVM_PREFERRED_TYPE(bool) mutable unsigned Invalid : 1; /// True if this attribute was used as a type attribute. + LLVM_PREFERRED_TYPE(bool) mutable unsigned UsedAsTypeAttr : 1; /// True if this has the extra information associated with an /// availability attribute. + LLVM_PREFERRED_TYPE(bool) unsigned IsAvailability : 1; /// True if this has extra information associated with a /// type_tag_for_datatype attribute. + LLVM_PREFERRED_TYPE(bool) unsigned IsTypeTagForDatatype : 1; /// True if this has extra information associated with a /// Microsoft __delcspec(property) attribute. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsProperty : 1; /// True if this has a ParsedType + LLVM_PREFERRED_TYPE(bool) unsigned HasParsedType : 1; /// True if the processing cache is valid. + LLVM_PREFERRED_TYPE(bool) mutable unsigned HasProcessingCache : 1; /// A cached value. mutable unsigned ProcessingCache : 8; /// True if the attribute is specified using '#pragma clang attribute'. + LLVM_PREFERRED_TYPE(bool) mutable unsigned IsPragmaClangAttribute : 1; /// The location of the 'unavailable' keyword in an -- cgit v1.1 From 0df8aed6c30f08ded526038a6bbb4daf113a31c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 10 Feb 2024 23:57:12 +0200 Subject: [LLD] [COFF] Pick timestamps from the SOURCE_DATE_EPOCH variable (#81326) The SOURCE_DATE_EPOCH environment variable can be set in order to get reproducible build. When linking PE/COFF modules with LLD, the timestamp field is set to the current time, unless either the /timestamp: or /Brepro option is set. If neither of them is set, check the SOURCE_DATE_EPOCH variable, before resorting to using the actual current date and time. See https://reproducible-builds.org/docs/source-date-epoch/ for reference on the use of this variable. --- lld/COFF/Driver.cpp | 10 +++++++++- lld/test/COFF/timestamp.test | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index e0afb6b..22ee2f1 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1825,7 +1825,15 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } else { config->repro = false; - config->timestamp = time(nullptr); + if (std::optional epoch = + Process::GetEnv("SOURCE_DATE_EPOCH")) { + StringRef value(*epoch); + if (value.getAsInteger(0, config->timestamp)) + fatal(Twine("invalid SOURCE_DATE_EPOCH timestamp: ") + value + + ". 
Expected 32-bit integer"); + } else { + config->timestamp = time(nullptr); + } } // Handle /alternatename diff --git a/lld/test/COFF/timestamp.test b/lld/test/COFF/timestamp.test index fbdc5788..c0658d6 100644 --- a/lld/test/COFF/timestamp.test +++ b/lld/test/COFF/timestamp.test @@ -3,9 +3,19 @@ RUN: yaml2obj %p/Inputs/generic.yaml -o %t.obj RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.1.exe RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.2.exe RUN: lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.3.exe +RUN: env SOURCE_DATE_EPOCH=0 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.4.exe +RUN: lld-link %t.obj /debug /timestamp:4294967295 /entry:main /nodefaultlib /out:%t.5.exe +RUN: env SOURCE_DATE_EPOCH=4294967295 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.6.exe +RUN: env SOURCE_DATE_EPOCH=12345 lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.7.exe +RUN: env LLD_IN_TEST=1 not lld-link %t.obj /debug /timestamp:4294967296 /entry:main /nodefaultlib /out:%t.8.exe 2>&1 | FileCheck %s --check-prefix=ERROR +RUN: env SOURCE_DATE_EPOCH=4294967296 env LLD_IN_TEST=1 not lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.9.exe 2>&1 | FileCheck %s --check-prefix=ERROR2 RUN: llvm-readobj --file-headers --coff-debug-directory %t.1.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.2.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.3.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.4.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.5.exe | FileCheck %s --check-prefix=MAX +RUN: llvm-readobj --file-headers --coff-debug-directory %t.6.exe | FileCheck %s --check-prefix=MAX +RUN: llvm-readobj --file-headers --coff-debug-directory %t.7.exe | FileCheck %s --check-prefix=ZERO HASH: ImageFileHeader { HASH: TimeDateStamp: [[STAMP:.*]] @@ -16,3 +26,11 @@ ZERO: ImageFileHeader { ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) ZERO: DebugDirectory [ ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) + +MAX: ImageFileHeader { +MAX: TimeDateStamp: 2106-02-07 06:28:15 (0xFFFFFFFF) +MAX: DebugDirectory [ +MAX: TimeDateStamp: 2106-02-07 06:28:15 (0xFFFFFFFF) + +ERROR: error: invalid timestamp: 4294967296. Expected 32-bit integer +ERROR2: error: invalid SOURCE_DATE_EPOCH timestamp: 4294967296. Expected 32-bit integer -- cgit v1.1 From b17348c3b541d7fc7ec441c98db75c18d8959910 Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Sun, 11 Feb 2024 07:35:19 +0900 Subject: [mlir][complex] Prevent underflow in complex.abs (#79786) (#81092) --- .../ComplexToStandard/ComplexToStandard.cpp | 58 +++++++--- .../ComplexToStandard/convert-to-standard.mlir | 125 +++++++++++++++++---- .../ComplexToStandard/full-conversion.mlir | 27 ++++- .../Dialect/Complex/CPU/correctness.mlir | 54 +++++++++ 4 files changed, 224 insertions(+), 40 deletions(-) diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 4c9dad9..cc31511 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -26,29 +26,59 @@ namespace mlir { using namespace mlir; namespace { +// The algorithm is listed in https://dl.acm.org/doi/pdf/10.1145/363717.363780. 
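+//
+// In outline: rather than computing sqrt(re*re + im*im) directly, which can
+// overflow or underflow even when the exact magnitude is representable, the
+// pattern below forms the result in a scaled way:
+//   |z| = |re| * sqrt(1 + (im/re)^2)   in the branch selected when re > im
+//   |z| = |im| * sqrt(1 + (re/im)^2)   otherwise
+// and uses selects to handle the cases where either component is zero.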
struct AbsOpConversion : public OpConversionPattern<complex::AbsOp> {
   using OpConversionPattern<complex::AbsOp>::OpConversionPattern;
 
   LogicalResult
   matchAndRewrite(complex::AbsOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto loc = op.getLoc();
-    auto type = op.getType();
+    mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
     arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr();
 
-    Value real =
-        rewriter.create<complex::ReOp>(loc, type, adaptor.getComplex());
-    Value imag =
-        rewriter.create<complex::ImOp>(loc, type, adaptor.getComplex());
-    Value realSqr =
-        rewriter.create<arith::MulFOp>(loc, real, real, fmf.getValue());
-    Value imagSqr =
-        rewriter.create<arith::MulFOp>(loc, imag, imag, fmf.getValue());
-    Value sqNorm =
-        rewriter.create<arith::AddFOp>(loc, realSqr, imagSqr, fmf.getValue());
-
-    rewriter.replaceOpWithNewOp<math::SqrtOp>(op, sqNorm);
+    Type elementType = op.getType();
+    Value arg = adaptor.getComplex();
+
+    Value zero =
+        b.create<arith::ConstantOp>(elementType, b.getZeroAttr(elementType));
+    Value one = b.create<arith::ConstantOp>(elementType,
+                                            b.getFloatAttr(elementType, 1.0));
+
+    Value real = b.create<complex::ReOp>(elementType, arg);
+    Value imag = b.create<complex::ImOp>(elementType, arg);
+
+    Value realIsZero =
+        b.create<arith::CmpFOp>(arith::CmpFPredicate::OEQ, real, zero);
+    Value imagIsZero =
+        b.create<arith::CmpFOp>(arith::CmpFPredicate::OEQ, imag, zero);
+
+    // Real > Imag
+    Value imagDivReal = b.create<arith::DivFOp>(imag, real, fmf.getValue());
+    Value imagSq =
+        b.create<arith::MulFOp>(imagDivReal, imagDivReal, fmf.getValue());
+    Value imagSqPlusOne = b.create<arith::AddFOp>(imagSq, one, fmf.getValue());
+    Value imagSqrt = b.create<math::SqrtOp>(imagSqPlusOne, fmf.getValue());
+    Value realAbs = b.create<math::AbsFOp>(real, fmf.getValue());
+    Value absImag = b.create<arith::MulFOp>(imagSqrt, realAbs, fmf.getValue());
+
+    // Real <= Imag
+    Value realDivImag = b.create<arith::DivFOp>(real, imag, fmf.getValue());
+    Value realSq =
+        b.create<arith::MulFOp>(realDivImag, realDivImag, fmf.getValue());
+    Value realSqPlusOne = b.create<arith::AddFOp>(realSq, one, fmf.getValue());
+    Value realSqrt = b.create<math::SqrtOp>(realSqPlusOne, fmf.getValue());
+    Value imagAbs = b.create<math::AbsFOp>(imag, fmf.getValue());
+    Value absReal = b.create<arith::MulFOp>(realSqrt, imagAbs, fmf.getValue());
+
+    rewriter.replaceOpWithNewOp<arith::SelectOp>(
+        op, realIsZero, imagAbs,
+        b.create<arith::SelectOp>(
+            imagIsZero, realAbs,
+            b.create<arith::SelectOp>(
+                b.create<arith::CmpFOp>(arith::CmpFPredicate::OGT, real, imag),
+                absImag, absReal)));
+
     return success();
   }
 };
diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
index 8fa29ea..1fe843b 100644
--- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -7,13 +7,30 @@ func.func @complex_abs(%arg: complex<f32>) -> f32 {
   %abs = complex.abs %arg: complex<f32>
   return %abs : f32
 }
+
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
 // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK-DAG: %[[REAL_SQ:.*]] = arith.mulf %[[REAL]], %[[REAL]] : f32
-// CHECK-DAG: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] : f32
-// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[REAL_SQ]], %[[IMAG_SQ]] : f32
-// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32
-// CHECK: return %[[NORM]] : f32
+// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32
+// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] : f32
+// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], 
%[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[ABS3:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 +// CHECK: return %[[ABS3]] : f32 // ----- @@ -241,12 +258,28 @@ func.func @complex_log(%arg: complex) -> complex { %log = complex.log %arg: complex return %log : complex } +// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL]], %[[REAL]] : f32 -// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 // CHECK: %[[RESULT_REAL:.*]] = math.log %[[NORM]] : f32 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex @@ -469,12 +502,28 @@ func.func @complex_sign(%arg: complex) -> complex { // CHECK: %[[REAL_IS_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 // CHECK: %[[IMAG_IS_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 // CHECK: %[[IS_ZERO:.*]] = arith.andi %[[REAL_IS_ZERO]], %[[IMAG_IS_ZERO]] : i1 +// CHECK: %[[ZERO:.*]] = 
arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex -// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL2]], %[[REAL2]] : f32 -// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG2]], %[[IMAG2]] : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL2]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG2]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG2]], %[[REAL2]] : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL2]] : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL2]], %[[IMAG2]] : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG2]] : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL2]], %[[IMAG2]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 // CHECK: %[[REAL_SIGN:.*]] = arith.divf %[[REAL]], %[[NORM]] : f32 // CHECK: %[[IMAG_SIGN:.*]] = arith.divf %[[IMAG]], %[[NORM]] : f32 // CHECK: %[[SIGN:.*]] = complex.create %[[REAL_SIGN]], %[[IMAG_SIGN]] : complex @@ -716,13 +765,29 @@ func.func @complex_abs_with_fmf(%arg: complex) -> f32 { %abs = complex.abs %arg fastmath : complex return %abs : f32 } +// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK-DAG: %[[REAL_SQ:.*]] = arith.mulf %[[REAL]], %[[REAL]] fastmath : f32 -// CHECK-DAG: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] fastmath : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[REAL_SQ]], %[[IMAG_SQ]] fastmath : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 -// CHECK: return %[[NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] fastmath : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] fastmath : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf 
%[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] fastmath : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] fastmath : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[ABS3:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 +// CHECK: return %[[ABS3]] : f32 // ----- @@ -807,12 +872,28 @@ func.func @complex_log_with_fmf(%arg: complex) -> complex { %log = complex.log %arg fastmath : complex return %log : complex } +// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL]], %[[REAL]] fastmath : f32 -// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] fastmath : f32 -// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] fastmath : f32 -// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32 +// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] fastmath : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] fastmath : f32 +// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] fastmath : f32 +// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] fastmath : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] fastmath : f32 +// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] fastmath : f32 +// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32 +// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32 +// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32 // CHECK: %[[RESULT_REAL:.*]] = math.log %[[NORM]] fastmath : f32 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex diff --git a/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir b/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir index 9983dd4..0f23e20 100644 --- a/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir +++ b/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir @@ -6,12 +6,31 @@ func.func @complex_abs(%arg: complex) -> f32 { %abs = complex.abs %arg: complex return %abs : f32 } +// CHECK: %[[ZERO:.*]] = 
llvm.mlir.constant(0.000000e+00 : f32) : f32 +// CHECK: %[[ONE:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : f32 // CHECK: %[[REAL:.*]] = llvm.extractvalue %[[ARG]][0] : ![[C_TY]] // CHECK: %[[IMAG:.*]] = llvm.extractvalue %[[ARG]][1] : ![[C_TY]] -// CHECK-DAG: %[[REAL_SQ:.*]] = llvm.fmul %[[REAL]], %[[REAL]] : f32 -// CHECK-DAG: %[[IMAG_SQ:.*]] = llvm.fmul %[[IMAG]], %[[IMAG]] : f32 -// CHECK: %[[SQ_NORM:.*]] = llvm.fadd %[[REAL_SQ]], %[[IMAG_SQ]] : f32 -// CHECK: %[[NORM:.*]] = llvm.intr.sqrt(%[[SQ_NORM]]) : (f32) -> f32 +// CHECK: %[[REAL_IS_ZERO:.*]] = llvm.fcmp "oeq" %[[REAL]], %[[ZERO]] : f32 +// CHECK: %[[IMAG_IS_ZERO:.*]] = llvm.fcmp "oeq" %[[IMAG]], %[[ZERO]] : f32 + +// CHECK: %[[IMAG_DIV_REAL:.*]] = llvm.fdiv %[[IMAG]], %[[REAL]] : f32 +// CHECK: %[[IMAG_SQ:.*]] = llvm.fmul %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32 +// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = llvm.fadd %[[IMAG_SQ]], %[[ONE]] : f32 +// CHECK: %[[IMAG_SQRT:.*]] = llvm.intr.sqrt(%[[IMAG_SQ_PLUS_ONE]]) : (f32) -> f32 +// CHECK: %[[REAL_ABS:.*]] = llvm.intr.fabs(%[[REAL]]) : (f32) -> f32 +// CHECK: %[[ABS_IMAG:.*]] = llvm.fmul %[[IMAG_SQRT]], %[[REAL_ABS]] : f32 + +// CHECK: %[[REAL_DIV_IMAG:.*]] = llvm.fdiv %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[REAL_SQ:.*]] = llvm.fmul %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32 +// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = llvm.fadd %[[REAL_SQ]], %[[ONE]] : f32 +// CHECK: %[[REAL_SQRT:.*]] = llvm.intr.sqrt(%[[REAL_SQ_PLUS_ONE]]) : (f32) -> f32 +// CHECK: %[[IMAG_ABS:.*]] = llvm.intr.fabs(%[[IMAG]]) : (f32) -> f32 +// CHECK: %[[ABS_REAL:.*]] = llvm.fmul %[[REAL_SQRT]], %[[IMAG_ABS]] : f32 + +// CHECK: %[[REAL_GT_IMAG:.*]] = llvm.fcmp "ogt" %[[REAL]], %[[IMAG]] : f32 +// CHECK: %[[ABS1:.*]] = llvm.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : i1, f32 +// CHECK: %[[ABS2:.*]] = llvm.select %[[IMAG_IS_ZERO]], %[[REAL_ABS]], %[[ABS1]] : i1, f32 +// CHECK: %[[NORM:.*]] = llvm.select %[[REAL_IS_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : i1, f32 // CHECK: llvm.return %[[NORM]] : f32 // CHECK-LABEL: llvm.func @complex_eq diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir index 349b92a..a42ed69 100644 --- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir +++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir @@ -106,6 +106,27 @@ func.func @angle(%arg: complex) -> f32 { func.return %angle : f32 } +func.func @test_element_f64(%input: tensor>, + %func: (complex) -> f64) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %size = tensor.dim %input, %c0: tensor> + + scf.for %i = %c0 to %size step %c1 { + %elem = tensor.extract %input[%i]: tensor> + + %val = func.call_indirect %func(%elem) : (complex) -> f64 + vector.print %val : f64 + scf.yield + } + func.return +} + +func.func @abs(%arg: complex) -> f64 { + %abs = complex.abs %arg : complex + func.return %abs : f64 +} + func.func @entry() { // complex.sqrt test %sqrt_test = arith.constant dense<[ @@ -300,5 +321,38 @@ func.func @entry() { call @test_element(%angle_test_cast, %angle_func) : (tensor>, (complex) -> f32) -> () + // complex.abs test + %abs_test = arith.constant dense<[ + (1.0, 1.0), + // CHECK: 1.414 + (1.0e300, 1.0e300), + // CHECK-NEXT: 1.41421e+300 + (1.0e-300, 1.0e-300), + // CHECK-NEXT: 1.41421e-300 + (5.0, 0.0), + // CHECK-NEXT: 5 + (0.0, 6.0), + // CHECK-NEXT: 6 + (7.0, 8.0), + // CHECK-NEXT: 10.6301 + (-1.0, -1.0), + // CHECK-NEXT: 1.414 + (-1.0e300, -1.0e300), + // CHECK-NEXT: 1.41421e+300 + (-1.0, 0.0), + // 
CHECK-NOT: -1
+    // CHECK-NEXT: 1
+    (0.0, -1.0)
+    // CHECK-NOT: -1
+    // CHECK-NEXT: 1
+  ]> : tensor<10xcomplex<f64>>
+  %abs_test_cast = tensor.cast %abs_test
+      : tensor<10xcomplex<f64>> to tensor<?xcomplex<f64>>
+
+  %abs_func = func.constant @abs : (complex<f64>) -> f64
+
+  call @test_element_f64(%abs_test_cast, %abs_func)
+      : (tensor<?xcomplex<f64>>, (complex<f64>) -> f64) -> ()
+
   func.return
 }
-- cgit v1.1

From d70b1c1206d93b5cdf31fa330d5717eb73e8794a Mon Sep 17 00:00:00 2001
From: Po-yao Chang
Date: Sun, 11 Feb 2024 09:36:59 +0800
Subject: [LLDB][Docs] Replace LLDB_RELOCATABLE_PYTHON with LLDB_EMBED_PYTHON_HOME (#81310)

LLDB_RELOCATABLE_PYTHON was removed in LLVM 11
(https://github.com/llvm/llvm-project/commit/3ec3f62f0a0b1ac13230922c91ffc988c1b1e8d5).
---
 lldb/docs/resources/build.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 0ccfef32..55fe73c 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -278,12 +278,12 @@ are commonly used on Windows.
   crash, rather than having to reproduce a failure or use a crash dump.
 * ``PYTHON_HOME`` (Required): Path to the folder where the Python distribution
   is installed. For example, ``C:\Python35``.
-* ``LLDB_RELOCATABLE_PYTHON`` (Default=0): When this is 0, LLDB will bind
+* ``LLDB_EMBED_PYTHON_HOME`` (Default=1 on Windows): When this is 1, LLDB will bind
   statically to the location specified in the ``PYTHON_HOME`` CMake variable,
   ignoring any value of ``PYTHONHOME`` set in the environment. This is most
   useful for developers who simply want to run LLDB after they build it. If you
   wish to move a build of LLDB to a different machine where Python will be in a
-  different location, setting ``LLDB_RELOCATABLE_PYTHON`` to 1 will cause
+  different location, setting ``LLDB_EMBED_PYTHON_HOME`` to 0 will cause
   Python to use its default mechanism for finding the python installation at
   runtime (looking for installed Pythons, or using the ``PYTHONHOME``
   environment variable if it is specified).
-- cgit v1.1

From d0f4663f488dee869ed797b684d4c3361539ac1c Mon Sep 17 00:00:00 2001
From: darkbuck
Date: Sat, 10 Feb 2024 21:44:05 -0500
Subject: [GlobalISel][Mips] Global ISel for `brcond`

- Mark `brcond` as equivalent to `G_BRCOND`.
- Remove the manual selection of `G_BRCOND` in Mips. Revise test cases.
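For the first bullet: `GINodeEquiv` is the TableGen class that declares a
generic MachineIR opcode and a SelectionDAG node to be equivalent, so that
imported SelectionDAG patterns can select the generic opcode directly.
Spelled out with its template arguments, the record added to
SelectionDAGCompat.td below should read `def : GINodeEquiv<G_BRCOND, brcond>;`.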
Reviewers: petar-avramovic, bcardosolopes, arsenm Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/81306 --- .../llvm/Target/GlobalISel/SelectionDAGCompat.td | 1 + llvm/lib/Target/Mips/MipsInstructionSelector.cpp | 7 - .../Mips/GlobalISel/instruction-select/branch.mir | 2 +- .../instruction-select/jump_table_and_brjt.mir | 297 ++++++++++++--------- .../Mips/GlobalISel/instruction-select/phi.mir | 16 +- 5 files changed, 176 insertions(+), 147 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index f792237..6bc1942 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -130,6 +130,7 @@ let IfConvergent = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS in { } def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp index 4478a57..654f29d 100644 --- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp +++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp @@ -357,13 +357,6 @@ bool MipsInstructionSelector::select(MachineInstr &I) { .addImm(0); break; } - case G_BRCOND: { - MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::BNE)) - .add(I.getOperand(0)) - .addUse(Mips::ZERO) - .add(I.getOperand(1)); - break; - } case G_BRJT: { unsigned EntrySize = MF.getJumpTableInfo()->getEntrySize(MF.getDataLayout()); diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir index 2de4096..1311632 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir @@ -77,7 +77,7 @@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32-NEXT: {{ $}} ; MIPS32-NEXT: bb.1.if.then: diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir index b8450ff..6022e7a 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir @@ -73,139 +73,174 @@ jumpTable: body: | ; MIPS32-LABEL: name: mod4_0_to_11 ; MIPS32: bb.0.entry: - ; MIPS32: successors: %bb.6(0x40000000), %bb.1(0x40000000) - ; MIPS32: liveins: $a0 - ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 - ; MIPS32: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 - ; MIPS32: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 - ; MIPS32: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 - ; MIPS32: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 - ; MIPS32: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 - ; MIPS32: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] - ; MIPS32: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] - ; MIPS32: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 - ; MIPS32: BNE [[ANDi]], $zero, %bb.6, implicit-def $at - ; MIPS32: bb.1.entry: - ; MIPS32: successors: %bb.2(0x20000000), %bb.3(0x20000000), 
%bb.4(0x20000000), %bb.5(0x20000000) - ; MIPS32: [[LUi:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.0 - ; MIPS32: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 - ; MIPS32: [[ADDu:%[0-9]+]]:gpr32 = ADDu [[LUi]], [[SLL]] - ; MIPS32: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32)) - ; MIPS32: PseudoIndirectBranch [[LW]] - ; MIPS32: bb.2.sw.bb: - ; MIPS32: $v0 = COPY [[ORi4]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.3.sw.bb1: - ; MIPS32: $v0 = COPY [[ORi3]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.4.sw.bb2: - ; MIPS32: $v0 = COPY [[ORi2]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.5.sw.bb3: - ; MIPS32: $v0 = COPY [[ORi1]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.6.sw.default: - ; MIPS32: successors: %bb.7(0x80000000) - ; MIPS32: bb.7.sw.epilog: - ; MIPS32: successors: %bb.13(0x40000000), %bb.8(0x40000000) - ; MIPS32: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 - ; MIPS32: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] - ; MIPS32: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] - ; MIPS32: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 - ; MIPS32: BNE [[ANDi1]], $zero, %bb.13, implicit-def $at - ; MIPS32: bb.8.sw.epilog: - ; MIPS32: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) - ; MIPS32: [[LUi1:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.1 - ; MIPS32: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 - ; MIPS32: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LUi1]], [[SLL1]] - ; MIPS32: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) - ; MIPS32: PseudoIndirectBranch [[LW1]] - ; MIPS32: bb.9.sw.bb4: - ; MIPS32: $v0 = COPY [[ORi4]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.10.sw.bb5: - ; MIPS32: $v0 = COPY [[ORi3]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.11.sw.bb6: - ; MIPS32: $v0 = COPY [[ORi2]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.12.sw.bb7: - ; MIPS32: $v0 = COPY [[ORi1]] - ; MIPS32: RetRA implicit $v0 - ; MIPS32: bb.13.sw.default8: - ; MIPS32: $v0 = COPY [[ADDiu]] - ; MIPS32: RetRA implicit $v0 + ; MIPS32-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000) + ; MIPS32-NEXT: liveins: $a0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 + ; MIPS32-NEXT: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 + ; MIPS32-NEXT: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 + ; MIPS32-NEXT: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 + ; MIPS32-NEXT: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 + ; MIPS32-NEXT: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 + ; MIPS32-NEXT: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32-NEXT: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] + ; MIPS32-NEXT: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] + ; MIPS32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 + ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.6, implicit-def dead $at + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.1.entry: + ; MIPS32-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[LUi:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.0 + ; MIPS32-NEXT: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 + ; MIPS32-NEXT: [[ADDu:%[0-9]+]]:gpr32 = ADDu [[LUi]], [[SLL]] + ; MIPS32-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32)) + ; MIPS32-NEXT: PseudoIndirectBranch [[LW]] + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.2.sw.bb: + ; MIPS32-NEXT: $v0 = COPY [[ORi4]] + ; 
MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.3.sw.bb1: + ; MIPS32-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.4.sw.bb2: + ; MIPS32-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.5.sw.bb3: + ; MIPS32-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.6.sw.default: + ; MIPS32-NEXT: successors: %bb.7(0x80000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.7.sw.epilog: + ; MIPS32-NEXT: successors: %bb.13(0x40000000), %bb.8(0x40000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 + ; MIPS32-NEXT: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] + ; MIPS32-NEXT: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] + ; MIPS32-NEXT: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 + ; MIPS32-NEXT: BNE [[ANDi1]], $zero, %bb.13, implicit-def dead $at + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.8.sw.epilog: + ; MIPS32-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: [[LUi1:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.1 + ; MIPS32-NEXT: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 + ; MIPS32-NEXT: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LUi1]], [[SLL1]] + ; MIPS32-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) + ; MIPS32-NEXT: PseudoIndirectBranch [[LW1]] + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.9.sw.bb4: + ; MIPS32-NEXT: $v0 = COPY [[ORi4]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.10.sw.bb5: + ; MIPS32-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.11.sw.bb6: + ; MIPS32-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.12.sw.bb7: + ; MIPS32-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; MIPS32-NEXT: {{ $}} + ; MIPS32-NEXT: bb.13.sw.default8: + ; MIPS32-NEXT: $v0 = COPY [[ADDiu]] + ; MIPS32-NEXT: RetRA implicit $v0 + ; ; MIPS32_PIC-LABEL: name: mod4_0_to_11 ; MIPS32_PIC: bb.0.entry: - ; MIPS32_PIC: successors: %bb.6(0x40000000), %bb.1(0x40000000) - ; MIPS32_PIC: liveins: $a0, $t9, $v0 - ; MIPS32_PIC: [[ADDu:%[0-9]+]]:gpr32 = ADDu $v0, $t9 - ; MIPS32_PIC: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 - ; MIPS32_PIC: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 - ; MIPS32_PIC: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 - ; MIPS32_PIC: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 - ; MIPS32_PIC: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 - ; MIPS32_PIC: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32_PIC: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 - ; MIPS32_PIC: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 - ; MIPS32_PIC: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] - ; MIPS32_PIC: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] - ; MIPS32_PIC: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 - ; MIPS32_PIC: BNE [[ANDi]], $zero, %bb.6, implicit-def $at - ; MIPS32_PIC: bb.1.entry: - ; MIPS32_PIC: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) - ; MIPS32_PIC: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.0 :: (load (s32) from got) - ; MIPS32_PIC: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 - ; MIPS32_PIC: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LW]], [[SLL]] - ; MIPS32_PIC: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) 
%jump-table.0 :: (load (s32)) - ; MIPS32_PIC: [[ADDu2:%[0-9]+]]:gpr32 = ADDu [[LW1]], [[ADDu]] - ; MIPS32_PIC: PseudoIndirectBranch [[ADDu2]] - ; MIPS32_PIC: bb.2.sw.bb: - ; MIPS32_PIC: $v0 = COPY [[ORi4]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.3.sw.bb1: - ; MIPS32_PIC: $v0 = COPY [[ORi3]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.4.sw.bb2: - ; MIPS32_PIC: $v0 = COPY [[ORi2]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.5.sw.bb3: - ; MIPS32_PIC: $v0 = COPY [[ORi1]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.6.sw.default: - ; MIPS32_PIC: successors: %bb.7(0x80000000) - ; MIPS32_PIC: bb.7.sw.epilog: - ; MIPS32_PIC: successors: %bb.13(0x40000000), %bb.8(0x40000000) - ; MIPS32_PIC: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 - ; MIPS32_PIC: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] - ; MIPS32_PIC: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] - ; MIPS32_PIC: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 - ; MIPS32_PIC: BNE [[ANDi1]], $zero, %bb.13, implicit-def $at - ; MIPS32_PIC: bb.8.sw.epilog: - ; MIPS32_PIC: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) - ; MIPS32_PIC: [[LW2:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.1 :: (load (s32) from got) - ; MIPS32_PIC: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 - ; MIPS32_PIC: [[ADDu3:%[0-9]+]]:gpr32 = ADDu [[LW2]], [[SLL1]] - ; MIPS32_PIC: [[LW3:%[0-9]+]]:gpr32 = LW [[ADDu3]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) - ; MIPS32_PIC: [[ADDu4:%[0-9]+]]:gpr32 = ADDu [[LW3]], [[ADDu]] - ; MIPS32_PIC: PseudoIndirectBranch [[ADDu4]] - ; MIPS32_PIC: bb.9.sw.bb4: - ; MIPS32_PIC: $v0 = COPY [[ORi4]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.10.sw.bb5: - ; MIPS32_PIC: $v0 = COPY [[ORi3]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.11.sw.bb6: - ; MIPS32_PIC: $v0 = COPY [[ORi2]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.12.sw.bb7: - ; MIPS32_PIC: $v0 = COPY [[ORi1]] - ; MIPS32_PIC: RetRA implicit $v0 - ; MIPS32_PIC: bb.13.sw.default8: - ; MIPS32_PIC: $v0 = COPY [[ADDiu]] - ; MIPS32_PIC: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000) + ; MIPS32_PIC-NEXT: liveins: $a0, $t9, $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[ADDu:%[0-9]+]]:gpr32 = ADDu $v0, $t9 + ; MIPS32_PIC-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $a0 + ; MIPS32_PIC-NEXT: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7 + ; MIPS32_PIC-NEXT: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3 + ; MIPS32_PIC-NEXT: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2 + ; MIPS32_PIC-NEXT: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1 + ; MIPS32_PIC-NEXT: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32_PIC-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535 + ; MIPS32_PIC-NEXT: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0 + ; MIPS32_PIC-NEXT: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]] + ; MIPS32_PIC-NEXT: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]] + ; MIPS32_PIC-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1 + ; MIPS32_PIC-NEXT: BNE [[ANDi]], $zero, %bb.6, implicit-def dead $at + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.1.entry: + ; MIPS32_PIC-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.0 :: (load (s32) from got) + ; MIPS32_PIC-NEXT: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2 + ; MIPS32_PIC-NEXT: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LW]], [[SLL]] + ; MIPS32_PIC-NEXT: 
[[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32)) + ; MIPS32_PIC-NEXT: [[ADDu2:%[0-9]+]]:gpr32 = ADDu [[LW1]], [[ADDu]] + ; MIPS32_PIC-NEXT: PseudoIndirectBranch [[ADDu2]] + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.2.sw.bb: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi4]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.3.sw.bb1: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.4.sw.bb2: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.5.sw.bb3: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.6.sw.default: + ; MIPS32_PIC-NEXT: successors: %bb.7(0x80000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.7.sw.epilog: + ; MIPS32_PIC-NEXT: successors: %bb.13(0x40000000), %bb.8(0x40000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8 + ; MIPS32_PIC-NEXT: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]] + ; MIPS32_PIC-NEXT: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]] + ; MIPS32_PIC-NEXT: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1 + ; MIPS32_PIC-NEXT: BNE [[ANDi1]], $zero, %bb.13, implicit-def dead $at + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.8.sw.epilog: + ; MIPS32_PIC-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: [[LW2:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.1 :: (load (s32) from got) + ; MIPS32_PIC-NEXT: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2 + ; MIPS32_PIC-NEXT: [[ADDu3:%[0-9]+]]:gpr32 = ADDu [[LW2]], [[SLL1]] + ; MIPS32_PIC-NEXT: [[LW3:%[0-9]+]]:gpr32 = LW [[ADDu3]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32)) + ; MIPS32_PIC-NEXT: [[ADDu4:%[0-9]+]]:gpr32 = ADDu [[LW3]], [[ADDu]] + ; MIPS32_PIC-NEXT: PseudoIndirectBranch [[ADDu4]] + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.9.sw.bb4: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi4]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.10.sw.bb5: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi3]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.11.sw.bb6: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi2]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.12.sw.bb7: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi1]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 + ; MIPS32_PIC-NEXT: {{ $}} + ; MIPS32_PIC-NEXT: bb.13.sw.default8: + ; MIPS32_PIC-NEXT: $v0 = COPY [[ADDiu]] + ; MIPS32_PIC-NEXT: RetRA implicit $v0 bb.1.entry: liveins: $a0 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir index 77e5ee2..44d31d99 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir @@ -80,7 +80,7 @@ body: | ; MIPS32FP32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: 
{{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -105,7 +105,7 @@ body: | ; MIPS32FP64-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP64-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: @@ -166,7 +166,7 @@ body: | ; MIPS32FP32-NEXT: [[ADDiu1:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.1, 0 ; MIPS32FP32-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDiu1]], 0 :: (load (s32) from %fixed-stack.1) ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: {{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -197,7 +197,7 @@ body: | ; MIPS32FP64-NEXT: [[ADDiu1:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.1, 0 ; MIPS32FP64-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDiu1]], 0 :: (load (s32) from %fixed-stack.1) ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: @@ -259,7 +259,7 @@ body: | ; MIPS32FP32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: {{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -284,7 +284,7 @@ body: | ; MIPS32FP64-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1 ; MIPS32FP64-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2 ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: @@ -341,7 +341,7 @@ body: | ; MIPS32FP32-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.0, 0 ; MIPS32FP32-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDiu]], 0 :: (load (s32) from %fixed-stack.0, align 8) ; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[LW]], 1 - ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP32-NEXT: {{ $}} ; MIPS32FP32-NEXT: bb.1.cond.true: @@ -367,7 +367,7 @@ body: | ; MIPS32FP64-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.0, 0 ; MIPS32FP64-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDiu]], 0 :: (load (s32) from %fixed-stack.0, align 8) ; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[LW]], 1 - ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at + ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at ; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at ; MIPS32FP64-NEXT: {{ $}} ; MIPS32FP64-NEXT: bb.1.cond.true: -- cgit v1.1 From 2c3ba9f6225612caf7d2d5ba6613ba1454d52dc3 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Sat, 10 Feb 2024 20:01:14 -0800 Subject: 
[mlir][Linalg] Unrestrict redundant transfer hoisting from func.func (#79516)

All hoistRedundantVectorTransfers does is walk the target operation, so it
does not have to be restricted to func.func.
---
 mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h | 6 ++----
 mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp        | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
index 921c3c3..186e83a 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -10,10 +10,8 @@
 #define MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_
 
 namespace mlir {
+class Operation;
 class RewriterBase;
-namespace func {
-class FuncOp;
-} // namespace func
 namespace scf {
 class ForOp;
 } // namespace scf
@@ -43,7 +41,7 @@ namespace linalg {
 ///
 /// WARNING: This hoisting does not model parallelism and is generally incorrect
 /// when used on distributed loops with memref semantics!
-void hoistRedundantVectorTransfers(func::FuncOp func);
+void hoistRedundantVectorTransfers(Operation *root);
 
 } // namespace linalg
 } // namespace mlir
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 80ce97e..34c9b2c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -73,16 +73,16 @@ static bool noAliasingUseInLoop(vector::TransferReadOp transferRead,
   return true;
 }
 
-void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) {
+void mlir::linalg::hoistRedundantVectorTransfers(Operation *root) {
   bool changed = true;
   while (changed) {
     changed = false;
    // First move loop invariant ops outside of their loop. This needs to be
    // done before as we cannot move ops without interrupting the function walk.
-    func.walk(
+    root->walk(
         [&](LoopLikeOpInterface loopLike) { moveLoopInvariantCode(loopLike); });
 
-    func.walk([&](vector::TransferReadOp transferRead) {
+    root->walk([&](vector::TransferReadOp transferRead) {
       if (!isa<MemRefType>(transferRead.getShapedType()))
         return WalkResult::advance();
-- cgit v1.1

From c2f9885a8aa3a820eefdacccf3fcc6b9d87e3284 Mon Sep 17 00:00:00 2001
From: Koakuma
Date: Sun, 11 Feb 2024 14:04:18 +0700
Subject: [SPARC] Support reserving arbitrary general purpose registers (#74927)

This adds support for marking arbitrary general purpose registers - except
for those with special purpose (G0, I6-I7, O6-O7) - as reserved, as needed
by some software like the Linux kernel.
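Before the diff, a rough sketch of how a +reserve-<reg> subtarget feature
typically reaches the register allocator on the backend side. This is a
fragment for illustration only: the predicate name `isRegisterReserved` is
hypothetical and may not match the exact helper this patch introduces; see
the SparcRegisterInfo.cpp hunk in the diffstat for the real change.

    // Sketch only: the reserved-register hook on the SPARC backend.
    BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
      BitVector Reserved(getNumRegs());
      const SparcSubtarget &ST = MF.getSubtarget<SparcSubtarget>();
      // Registers with fixed roles (%g0, %sp, %fp, return address) stay
      // reserved unconditionally.
      for (unsigned Reg : {SP::G0, SP::O6, SP::I6, SP::I7})
        markSuperRegs(Reserved, Reg);
      // User-requested reservations, one per +reserve-<reg> feature.
      for (unsigned Reg : {SP::G1, SP::G2, SP::G3, SP::G4, SP::G5, SP::G6, SP::G7})
        if (ST.isRegisterReserved(Reg)) // hypothetical predicate
          markSuperRegs(Reserved, Reg);
      return Reserved;
    }

The driver side is mechanical: each -ffixed-<reg> option simply toggles the
corresponding +reserve-<reg> target feature, as the Sparc.cpp hunk below shows.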
--- clang/include/clang/Driver/Options.td | 12 ++ clang/lib/Driver/ToolChains/Arch/Sparc.cpp | 81 +++++++++ clang/test/Driver/sparc-fixed-register.c | 181 +++++++++++++++++++++ llvm/lib/Target/Sparc/Sparc.td | 14 ++ llvm/lib/Target/Sparc/SparcISelLowering.cpp | 43 +++++ llvm/lib/Target/Sparc/SparcRegisterInfo.cpp | 14 +- llvm/lib/Target/Sparc/SparcRegisterInfo.h | 1 + llvm/lib/Target/Sparc/SparcRegisterInfo.td | 4 + llvm/lib/Target/Sparc/SparcSubtarget.cpp | 1 + llvm/lib/Target/Sparc/SparcSubtarget.h | 10 ++ llvm/test/CodeGen/SPARC/reserved-arg-regs.ll | 25 +++ llvm/test/CodeGen/SPARC/reserved-regs-named.ll | 13 ++ .../CodeGen/SPARC/reserved-regs-unavailable.ll | 14 ++ llvm/test/CodeGen/SPARC/reserved-regs.ll | 17 ++ 14 files changed, 428 insertions(+), 2 deletions(-) create mode 100644 clang/test/Driver/sparc-fixed-register.c create mode 100644 llvm/test/CodeGen/SPARC/reserved-arg-regs.ll create mode 100644 llvm/test/CodeGen/SPARC/reserved-regs-named.ll create mode 100644 llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f00732..31e8571 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5829,6 +5829,18 @@ def mvis3 : Flag<["-"], "mvis3">, Group; def mno_vis3 : Flag<["-"], "mno-vis3">, Group; def mhard_quad_float : Flag<["-"], "mhard-quad-float">, Group; def msoft_quad_float : Flag<["-"], "msoft-quad-float">, Group; +foreach i = 1 ... 7 in + def ffixed_g#i : Flag<["-"], "ffixed-g"#i>, Group, + HelpText<"Reserve the G"#i#" register (SPARC only)">; +foreach i = 0 ... 5 in + def ffixed_o#i : Flag<["-"], "ffixed-o"#i>, Group, + HelpText<"Reserve the O"#i#" register (SPARC only)">; +foreach i = 0 ... 7 in + def ffixed_l#i : Flag<["-"], "ffixed-l"#i>, Group, + HelpText<"Reserve the L"#i#" register (SPARC only)">; +foreach i = 0 ... 
5 in + def ffixed_i#i : Flag<["-"], "ffixed-i"#i>, Group, + HelpText<"Reserve the I"#i#" register (SPARC only)">; } // let Flags = [TargetSpecific] // M68k features flags diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 22e5830..ae1a4ba 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -178,4 +178,85 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args, else Features.push_back("-hard-quad-float"); } + + if (Args.hasArg(options::OPT_ffixed_g1)) + Features.push_back("+reserve-g1"); + + if (Args.hasArg(options::OPT_ffixed_g2)) + Features.push_back("+reserve-g2"); + + if (Args.hasArg(options::OPT_ffixed_g3)) + Features.push_back("+reserve-g3"); + + if (Args.hasArg(options::OPT_ffixed_g4)) + Features.push_back("+reserve-g4"); + + if (Args.hasArg(options::OPT_ffixed_g5)) + Features.push_back("+reserve-g5"); + + if (Args.hasArg(options::OPT_ffixed_g6)) + Features.push_back("+reserve-g6"); + + if (Args.hasArg(options::OPT_ffixed_g7)) + Features.push_back("+reserve-g7"); + + if (Args.hasArg(options::OPT_ffixed_o0)) + Features.push_back("+reserve-o0"); + + if (Args.hasArg(options::OPT_ffixed_o1)) + Features.push_back("+reserve-o1"); + + if (Args.hasArg(options::OPT_ffixed_o2)) + Features.push_back("+reserve-o2"); + + if (Args.hasArg(options::OPT_ffixed_o3)) + Features.push_back("+reserve-o3"); + + if (Args.hasArg(options::OPT_ffixed_o4)) + Features.push_back("+reserve-o4"); + + if (Args.hasArg(options::OPT_ffixed_o5)) + Features.push_back("+reserve-o5"); + + if (Args.hasArg(options::OPT_ffixed_l0)) + Features.push_back("+reserve-l0"); + + if (Args.hasArg(options::OPT_ffixed_l1)) + Features.push_back("+reserve-l1"); + + if (Args.hasArg(options::OPT_ffixed_l2)) + Features.push_back("+reserve-l2"); + + if (Args.hasArg(options::OPT_ffixed_l3)) + Features.push_back("+reserve-l3"); + + if (Args.hasArg(options::OPT_ffixed_l4)) + Features.push_back("+reserve-l4"); + + if (Args.hasArg(options::OPT_ffixed_l5)) + Features.push_back("+reserve-l5"); + + if (Args.hasArg(options::OPT_ffixed_l6)) + Features.push_back("+reserve-l6"); + + if (Args.hasArg(options::OPT_ffixed_l7)) + Features.push_back("+reserve-l7"); + + if (Args.hasArg(options::OPT_ffixed_i0)) + Features.push_back("+reserve-i0"); + + if (Args.hasArg(options::OPT_ffixed_i1)) + Features.push_back("+reserve-i1"); + + if (Args.hasArg(options::OPT_ffixed_i2)) + Features.push_back("+reserve-i2"); + + if (Args.hasArg(options::OPT_ffixed_i3)) + Features.push_back("+reserve-i3"); + + if (Args.hasArg(options::OPT_ffixed_i4)) + Features.push_back("+reserve-i4"); + + if (Args.hasArg(options::OPT_ffixed_i5)) + Features.push_back("+reserve-i5"); } diff --git a/clang/test/Driver/sparc-fixed-register.c b/clang/test/Driver/sparc-fixed-register.c new file mode 100644 index 0000000..24880b9 --- /dev/null +++ b/clang/test/Driver/sparc-fixed-register.c @@ -0,0 +1,181 @@ +// RUN: %clang --target=sparc-none-gnu -ffixed-g1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G1 < %t %s +// CHECK-FIXED-G1: "-target-feature" "+reserve-g1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G2 < %t %s +// CHECK-FIXED-G2: "-target-feature" "+reserve-g2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G3 < %t %s +// CHECK-FIXED-G3: "-target-feature" "+reserve-g3" + +// RUN: %clang --target=sparc-none-gnu 
-ffixed-g4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G4 < %t %s +// CHECK-FIXED-G4: "-target-feature" "+reserve-g4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G5 < %t %s +// CHECK-FIXED-G5: "-target-feature" "+reserve-g5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G6 < %t %s +// CHECK-FIXED-G6: "-target-feature" "+reserve-g6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G7 < %t %s +// CHECK-FIXED-G7: "-target-feature" "+reserve-g7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O0 < %t %s +// CHECK-FIXED-O0: "-target-feature" "+reserve-o0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O1 < %t %s +// CHECK-FIXED-O1: "-target-feature" "+reserve-o1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O2 < %t %s +// CHECK-FIXED-O2: "-target-feature" "+reserve-o2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O3 < %t %s +// CHECK-FIXED-O3: "-target-feature" "+reserve-o3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O4 < %t %s +// CHECK-FIXED-O4: "-target-feature" "+reserve-o4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O5 < %t %s +// CHECK-FIXED-O5: "-target-feature" "+reserve-o5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L0 < %t %s +// CHECK-FIXED-L0: "-target-feature" "+reserve-l0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L1 < %t %s +// CHECK-FIXED-L1: "-target-feature" "+reserve-l1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L2 < %t %s +// CHECK-FIXED-L2: "-target-feature" "+reserve-l2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L3 < %t %s +// CHECK-FIXED-L3: "-target-feature" "+reserve-l3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L4 < %t %s +// CHECK-FIXED-L4: "-target-feature" "+reserve-l4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L5 < %t %s +// CHECK-FIXED-L5: "-target-feature" "+reserve-l5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L6 < %t %s +// CHECK-FIXED-L6: "-target-feature" "+reserve-l6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L7 < %t %s +// CHECK-FIXED-L7: "-target-feature" "+reserve-l7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I0 < %t %s +// CHECK-FIXED-I0: "-target-feature" "+reserve-i0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I1 < %t %s +// CHECK-FIXED-I1: "-target-feature" "+reserve-i1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i2 -### %s 2> %t +// RUN: 
FileCheck --check-prefix=CHECK-FIXED-I2 < %t %s +// CHECK-FIXED-I2: "-target-feature" "+reserve-i2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I3 < %t %s +// CHECK-FIXED-I3: "-target-feature" "+reserve-i3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I4 < %t %s +// CHECK-FIXED-I4: "-target-feature" "+reserve-i4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I5 < %t %s +// CHECK-FIXED-I5: "-target-feature" "+reserve-i5" + +// Test multiple reserve-* options together. +// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-i4 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: < %t %s + +// Test all reserve-* options together. +// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-g2 \ +// RUN: -ffixed-g3 \ +// RUN: -ffixed-g4 \ +// RUN: -ffixed-g5 \ +// RUN: -ffixed-g6 \ +// RUN: -ffixed-g7 \ +// RUN: -ffixed-o0 \ +// RUN: -ffixed-o1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-o3 \ +// RUN: -ffixed-o4 \ +// RUN: -ffixed-o5 \ +// RUN: -ffixed-l0 \ +// RUN: -ffixed-l1 \ +// RUN: -ffixed-l2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-l4 \ +// RUN: -ffixed-l5 \ +// RUN: -ffixed-l6 \ +// RUN: -ffixed-l7 \ +// RUN: -ffixed-i0 \ +// RUN: -ffixed-i1 \ +// RUN: -ffixed-i2 \ +// RUN: -ffixed-i3 \ +// RUN: -ffixed-i4 \ +// RUN: -ffixed-i5 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-G2 \ +// RUN: --check-prefix=CHECK-FIXED-G3 \ +// RUN: --check-prefix=CHECK-FIXED-G4 \ +// RUN: --check-prefix=CHECK-FIXED-G5 \ +// RUN: --check-prefix=CHECK-FIXED-G6 \ +// RUN: --check-prefix=CHECK-FIXED-G7 \ +// RUN: --check-prefix=CHECK-FIXED-O0 \ +// RUN: --check-prefix=CHECK-FIXED-O1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-O3 \ +// RUN: --check-prefix=CHECK-FIXED-O4 \ +// RUN: --check-prefix=CHECK-FIXED-O5 \ +// RUN: --check-prefix=CHECK-FIXED-L0 \ +// RUN: --check-prefix=CHECK-FIXED-L1 \ +// RUN: --check-prefix=CHECK-FIXED-L2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-L4 \ +// RUN: --check-prefix=CHECK-FIXED-L5 \ +// RUN: --check-prefix=CHECK-FIXED-L6 \ +// RUN: --check-prefix=CHECK-FIXED-L7 \ +// RUN: --check-prefix=CHECK-FIXED-I0 \ +// RUN: --check-prefix=CHECK-FIXED-I1 \ +// RUN: --check-prefix=CHECK-FIXED-I2 \ +// RUN: --check-prefix=CHECK-FIXED-I3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: --check-prefix=CHECK-FIXED-I5 \ +// RUN: < %t %s diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 7b10339..38a59e6 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -72,6 +72,20 @@ def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", //==== Features added predominantly for LEON subtarget support include "LeonFeatures.td" +//==== Register allocation tweaks needed by some low-level software +foreach i = 1 ... 7 in + def FeatureReserveG#i : SubtargetFeature<"reserve-g"#i, "ReserveRegister["#i#" + SP::G0]", "true", + "Reserve G"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 
5 in + def FeatureReserveO#i : SubtargetFeature<"reserve-o"#i, "ReserveRegister["#i#" + SP::O0]", "true", + "Reserve O"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 7 in + def FeatureReserveL#i : SubtargetFeature<"reserve-l"#i, "ReserveRegister["#i#" + SP::L0]", "true", + "Reserve L"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 5 in + def FeatureReserveI#i : SubtargetFeature<"reserve-i"#i, "ReserveRegister["#i#" + SP::I0]", "true", + "Reserve I"#i#", making it unavailable as a GPR">; + //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 78bdf3a..bdefb08 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -13,6 +13,7 @@ #include "SparcISelLowering.h" #include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcMachineFunctionInfo.h" #include "SparcRegisterInfo.h" #include "SparcTargetMachine.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" @@ -729,6 +731,30 @@ SDValue SparcTargetLowering::LowerFormalArguments_64( return Chain; } +// Check whether any of the argument registers are reserved +static bool isAnyArgRegReserved(const SparcRegisterInfo *TRI, + const MachineFunction &MF) { + // The register window design means that outgoing parameters at O* + // will appear in the callee as I*. + // Be conservative and check both sides of the register names. + bool Outgoing = + llvm::any_of(SP::GPROutgoingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + bool Incoming = + llvm::any_of(SP::GPRIncomingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + return Outgoing || Incoming; +} + +static void emitReservedArgRegCallError(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, ("SPARC doesn't support" + " function calls if any of the argument registers is reserved.")}); +} + SDValue SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -805,6 +831,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -1055,6 +1082,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? 
TRI->getRTCallPreservedMask(CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1125,6 +1156,13 @@ Register SparcTargetLowering::getRegisterByName(const char* RegName, LLT VT, .Case("g4", SP::G4).Case("g5", SP::G5).Case("g6", SP::G6).Case("g7", SP::G7) .Default(0); + // If we're directly referencing register names + // (e.g. in GCC C extension `register int r asm("g1");`), + // make sure that said register is in the reserve list. + const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (!TRI->isReservedReg(MF, Reg)) + Reg = 0; + if (Reg) return Reg; @@ -1189,6 +1227,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, SDLoc DL = CLI.DL; SDValue Chain = CLI.Chain; auto PtrVT = getPointerTy(DAG.getDataLayout()); + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -1372,6 +1411,10 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index f97bf57..71a27f7 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -12,10 +12,8 @@ #include "SparcRegisterInfo.h" #include "Sparc.h" -#include "SparcMachineFunctionInfo.h" #include "SparcSubtarget.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -98,9 +96,21 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n < 31; n++) Reserved.set(SP::ASR1 + n); + for (TargetRegisterClass::iterator i = SP::IntRegsRegClass.begin(); + i != SP::IntRegsRegClass.end(); ++i) { + if (MF.getSubtarget<SparcSubtarget>().isRegisterReserved(*i)) + markSuperRegs(Reserved, *i); + } + + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } +bool SparcRegisterInfo::isReservedReg(const MachineFunction &MF, + MCRegister Reg) const { + return getReservedRegs(MF)[Reg]; +} + const TargetRegisterClass* SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/llvm/lib/Target/Sparc/SparcRegisterInfo.h index 5b3c1a7..58c85f3 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -30,6 +30,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const; const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, unsigned Kind) const override; diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/llvm/lib/Target/Sparc/SparcRegisterInfo.td index d5ba746..d8319a8 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.td +++ 
b/llvm/lib/Target/Sparc/SparcRegisterInfo.td @@ -370,6 +370,10 @@ def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>; // Floating point control register classes. def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; +// GPR argument registers. +def GPROutgoingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "O%u", 0, 5)>; +def GPRIncomingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "I%u", 0, 5)>; + let isAllocatable = 0 in { // Ancillary state registers // FIXME: TICK is special-cased here as it can be accessed diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp index 6b09904..5b65e34 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -50,6 +50,7 @@ SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU, const StringRef &FS, const TargetMachine &TM, bool is64Bit) : SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS), + ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()), TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)), TLInfo(TM, *this), FrameLowering(*this) {} diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index cdb210f..fe4aca5 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -13,12 +13,14 @@ #ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H #define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcFrameLowering.h" #include "SparcISelLowering.h" #include "SparcInstrInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include <string> @@ -29,6 +31,10 @@ namespace llvm { class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { + // ReserveRegister[i] - Register #i is not available as a general purpose + // register. + BitVector ReserveRegister; + Triple TargetTriple; virtual void anchor(); @@ -82,6 +88,10 @@ public: return is64Bit() ? 2047 : 0; } + bool isRegisterReserved(MCPhysReg PhysReg) const { + return ReserveRegister[PhysReg]; + } + /// Given an actual stack size as determined by FrameInfo, this function /// returns adjusted framesize which includes space for register window /// spills and arguments. diff --git a/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll new file mode 100644 index 0000000..3587ecb --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll @@ -0,0 +1,25 @@ +;; Test reserving argument registers. +; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. 
+define void @call_function() { + call void @foo() + ret void +} +declare void @foo() + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +define void @call_function_with_arg(i8 %in) { + call void @bar(i8 %in) + ret void +} +declare void @bar(i8) diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-named.ll b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll new file mode 100644 index 0000000..91808be --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l0 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references are caught as well. + +; CHECK-RESERVED-L0: %l0 +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll new file mode 100644 index 0000000..53ca045 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll @@ -0,0 +1,14 @@ +; RUN: not --crash llc -mtriple=sparc64-linux-gnu -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references for non-reserved registers +;; are caught properly. + +; CHECK-RESERVED-L0: LLVM ERROR: Invalid register name global variable +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs.ll b/llvm/test/CodeGen/SPARC/reserved-regs.ll index 27ebf47..bf46177 100644 --- a/llvm/test/CodeGen/SPARC/reserved-regs.ll +++ b/llvm/test/CodeGen/SPARC/reserved-regs.ll @@ -1,5 +1,14 @@ ; RUN: llc -march=sparc -verify-machineinstrs < %s | FileCheck %s +;; Test reserve-* options. +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-o1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-O1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-I1 + +;; Test multiple reserve-* options together. 
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -mattr=+reserve-o1 -mattr=+reserve-l1 -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1,CHECK-RESERVED-O1,CHECK-RESERVED-L1,CHECK-RESERVED-I1 + @g = common global [32 x i32] zeroinitializer, align 16 @h = common global [16 x i64] zeroinitializer, align 16 @@ -16,6 +25,10 @@ ; CHECK-NOT: %o6 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i32_regs() { entry: @@ -100,6 +113,10 @@ entry: ; CHECK-NOT: %o7 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i64_regs() { entry: -- cgit v1.1 From d9124332aa3b95725b149617066fdd1f525b530d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 10:40:05 +0300 Subject: [clang][NFC] Annotate `Sema/DelayedDiagnostic.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/DelayedDiagnostic.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/include/clang/Sema/DelayedDiagnostic.h b/clang/include/clang/Sema/DelayedDiagnostic.h index 9de7131..0105089 100644 --- a/clang/include/clang/Sema/DelayedDiagnostic.h +++ b/clang/include/clang/Sema/DelayedDiagnostic.h @@ -111,7 +111,9 @@ public: } private: + LLVM_PREFERRED_TYPE(AccessSpecifier) unsigned Access : 2; + LLVM_PREFERRED_TYPE(bool) unsigned IsMember : 1; NamedDecl *Target; CXXRecordDecl *NamingClass; -- cgit v1.1 From c0ed1b2c08ab3b75e79d90fcda7e949ca50400a5 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 10:47:45 +0300 Subject: [clang][NFC] Annotate `Basic/Visibility.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Basic/Visibility.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/Basic/Visibility.h b/clang/include/clang/Basic/Visibility.h index 1e19630..b9693e6 100644 --- a/clang/include/clang/Basic/Visibility.h +++ b/clang/include/clang/Basic/Visibility.h @@ -51,8 +51,11 @@ inline Visibility minVisibility(Visibility L, Visibility R) { } class LinkageInfo { + LLVM_PREFERRED_TYPE(Linkage) uint8_t linkage_ : 3; + LLVM_PREFERRED_TYPE(Visibility) uint8_t visibility_ : 2; + LLVM_PREFERRED_TYPE(bool) uint8_t explicit_ : 1; void setVisibility(Visibility V, bool E) { visibility_ = V; explicit_ = E; } -- cgit v1.1 From 07ec9a3799fa1e80888f8bd0c1101ad6dd546842 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 10:58:03 +0300 Subject: [clang][NFC] Partially annotate `CGFunctionInfo.h` with `preferred_type` `CallingConvention` and `EffectiveCallingConvention` bit-fields that hold `llvm::CallingConv` are impossible to annotate at the moment, as `llvm::CallingConv` is actually a namespace with an unnamed enum inside. --- clang/include/clang/CodeGen/CGFunctionInfo.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h index e388901..811f334 100644 --- a/clang/include/clang/CodeGen/CGFunctionInfo.h +++ b/clang/include/clang/CodeGen/CGFunctionInfo.h @@ -564,35 +564,45 @@ class CGFunctionInfo final unsigned EffectiveCallingConvention : 8; /// The clang::CallingConv that this was originally created with. 
+ LLVM_PREFERRED_TYPE(CallingConv) unsigned ASTCallingConvention : 6; /// Whether this is an instance method. + LLVM_PREFERRED_TYPE(bool) unsigned InstanceMethod : 1; /// Whether this is a chain call. + LLVM_PREFERRED_TYPE(bool) unsigned ChainCall : 1; /// Whether this function is called by forwarding arguments. /// This doesn't support inalloca or varargs. + LLVM_PREFERRED_TYPE(bool) unsigned DelegateCall : 1; /// Whether this function is a CMSE nonsecure call + LLVM_PREFERRED_TYPE(bool) unsigned CmseNSCall : 1; /// Whether this function is noreturn. + LLVM_PREFERRED_TYPE(bool) unsigned NoReturn : 1; /// Whether this function is returns-retained. + LLVM_PREFERRED_TYPE(bool) unsigned ReturnsRetained : 1; /// Whether this function saved caller registers. + LLVM_PREFERRED_TYPE(bool) unsigned NoCallerSavedRegs : 1; /// How many arguments to pass inreg. + LLVM_PREFERRED_TYPE(bool) unsigned HasRegParm : 1; unsigned RegParm : 3; /// Whether this function has nocf_check attribute. + LLVM_PREFERRED_TYPE(bool) unsigned NoCfCheck : 1; /// Log 2 of the maximum vector width. @@ -604,6 +614,7 @@ class CGFunctionInfo final /// passing non-trivial types with inalloca. Not part of the profile. llvm::StructType *ArgStruct; unsigned ArgStructAlign : 31; + LLVM_PREFERRED_TYPE(bool) unsigned HasExtParameterInfos : 1; unsigned NumArgs; -- cgit v1.1 From c112f963ce2b2efc8da765a1161402cebfa379b8 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:00:28 +0300 Subject: [clang][NFC] Annotate `AnalysisBasedWarning.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/AnalysisBasedWarnings.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 020ddd3..aafe227 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -34,9 +34,13 @@ public: class Policy { friend class AnalysisBasedWarnings; // The warnings to run. + LLVM_PREFERRED_TYPE(bool) unsigned enableCheckFallThrough : 1; + LLVM_PREFERRED_TYPE(bool) unsigned enableCheckUnreachable : 1; + LLVM_PREFERRED_TYPE(bool) unsigned enableThreadSafetyAnalysis : 1; + LLVM_PREFERRED_TYPE(bool) unsigned enableConsumedAnalysis : 1; public: Policy(); -- cgit v1.1 From 0764254e014db8783a31e84a322636c651bc7d6d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:04:12 +0300 Subject: [clang][NFC] Annotate `StmtOpenMP.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/AST/StmtOpenMP.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index 6216433..3cb3c10 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -2974,6 +2974,7 @@ class OMPAtomicDirective : public OMPExecutableDirective { /// This field is 1 for the first form of the expression and 0 for the /// second. Required for correct codegen of non-associative operations (like /// << or >>). + LLVM_PREFERRED_TYPE(bool) uint8_t IsXLHSInRHSPart : 1; /// Used for 'atomic update' or 'atomic capture' constructs. They may /// have atomic expressions of forms: @@ -2983,9 +2984,11 @@ class OMPAtomicDirective : public OMPExecutableDirective { /// \endcode /// This field is 1 for the first(postfix) form of the expression and 0 /// otherwise. 
+ LLVM_PREFERRED_TYPE(bool) uint8_t IsPostfixUpdate : 1; /// 1 if 'v' is updated only when the condition is false (compare capture /// only). + LLVM_PREFERRED_TYPE(bool) uint8_t IsFailOnly : 1; } Flags; -- cgit v1.1 From c8a12ed413aae2c2602c880395270acbdbb15e70 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:09:34 +0300 Subject: [clang][NFC] Annotate `CommentCommandTraits.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/AST/CommentCommandTraits.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/clang/include/clang/AST/CommentCommandTraits.h b/clang/include/clang/AST/CommentCommandTraits.h index 83a29a5..0c3254d 100644 --- a/clang/include/clang/AST/CommentCommandTraits.h +++ b/clang/include/clang/AST/CommentCommandTraits.h @@ -50,52 +50,65 @@ struct CommandInfo { unsigned NumArgs : 4; /// True if this command is an inline command (of any kind). + LLVM_PREFERRED_TYPE(bool) unsigned IsInlineCommand : 1; /// True if this command is a block command (of any kind). + LLVM_PREFERRED_TYPE(bool) unsigned IsBlockCommand : 1; /// True if this command is introducing a brief documentation /// paragraph (\or an alias). + LLVM_PREFERRED_TYPE(bool) unsigned IsBriefCommand : 1; /// True if this command is \\returns or an alias. + LLVM_PREFERRED_TYPE(bool) unsigned IsReturnsCommand : 1; /// True if this command is introducing documentation for a function /// parameter (\\param or an alias). + LLVM_PREFERRED_TYPE(bool) unsigned IsParamCommand : 1; /// True if this command is introducing documentation for /// a template parameter (\\tparam or an alias). + LLVM_PREFERRED_TYPE(bool) unsigned IsTParamCommand : 1; /// True if this command is \\throws or an alias. + LLVM_PREFERRED_TYPE(bool) unsigned IsThrowsCommand : 1; /// True if this command is \\deprecated or an alias. + LLVM_PREFERRED_TYPE(bool) unsigned IsDeprecatedCommand : 1; /// True if this is a \\headerfile-like command. + LLVM_PREFERRED_TYPE(bool) unsigned IsHeaderfileCommand : 1; /// True if we don't want to warn about this command being passed an empty /// paragraph. Meaningful only for block commands. + LLVM_PREFERRED_TYPE(bool) unsigned IsEmptyParagraphAllowed : 1; /// True if this command is a verbatim-like block command. /// /// A verbatim-like block command eats every character (except line starting /// decorations) until matching end command is seen or comment end is hit. + LLVM_PREFERRED_TYPE(bool) unsigned IsVerbatimBlockCommand : 1; /// True if this command is an end command for a verbatim-like block. + LLVM_PREFERRED_TYPE(bool) unsigned IsVerbatimBlockEndCommand : 1; /// True if this command is a verbatim line command. /// /// A verbatim-like line command eats everything until a newline is seen or /// comment end is hit. + LLVM_PREFERRED_TYPE(bool) unsigned IsVerbatimLineCommand : 1; /// True if this command contains a declaration for the entity being @@ -105,20 +118,25 @@ struct CommandInfo { /// \code /// \fn void f(int a); /// \endcode + LLVM_PREFERRED_TYPE(bool) unsigned IsDeclarationCommand : 1; /// True if verbatim-like line command is a function declaration. + LLVM_PREFERRED_TYPE(bool) unsigned IsFunctionDeclarationCommand : 1; /// True if block command is further describing a container API; such /// as \@coclass, \@classdesign, etc. + LLVM_PREFERRED_TYPE(bool) unsigned IsRecordLikeDetailCommand : 1; /// True if block command is a container API; such as \@interface. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsRecordLikeDeclarationCommand : 1; /// True if this command is unknown. This \c CommandInfo object was /// created during parsing. + LLVM_PREFERRED_TYPE(bool) unsigned IsUnknownCommand : 1; }; -- cgit v1.1 From ab2cef5391cc3434bc54b755810c51b55e9a04fc Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:16:58 +0300 Subject: [clang][NFC] Annotate `Analysis/CFG.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Analysis/CFG.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/Analysis/CFG.h b/clang/include/clang/Analysis/CFG.h index 9f776ca..a7ff38c 100644 --- a/clang/include/clang/Analysis/CFG.h +++ b/clang/include/clang/Analysis/CFG.h @@ -879,6 +879,7 @@ private: /// /// Optimization Note: This bit could be profitably folded with Terminator's /// storage if the memory usage of CFGBlock becomes an issue. + LLVM_PREFERRED_TYPE(bool) unsigned HasNoReturnElement : 1; /// The parent CFG that owns this CFGBlock. @@ -1007,7 +1008,9 @@ public: class FilterOptions { public: + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreNullPredecessors : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreDefaultsWithCoveredEnums : 1; FilterOptions() -- cgit v1.1 From 535da10842c7309e9eeaf9828cf6bb034fecaf16 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sun, 11 Feb 2024 08:17:53 +0000 Subject: [lldb] Check whether lldb can trace/attach/set a breakpoint in a process or load a file to debug on FreeBSD. Before emitting the generic EINVAL message, we check whether the `security.bsd.unprivileged_proc_debug` sysctl allows process debugging. Closes #79634 --- .../Process/FreeBSD/NativeProcessFreeBSD.cpp | 39 +++++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp index 19e0986..9c620e4 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp @@ -48,20 +48,38 @@ static Status EnsureFDFlags(int fd, int flags) { return error; } +static Status CanTrace() { + int proc_debug, ret; + size_t len = sizeof(proc_debug); + ret = ::sysctlbyname("security.bsd.unprivileged_proc_debug", &proc_debug, + &len, nullptr, 0); + if (ret != 0) + return Status("sysctlbyname() security.bsd.unprivileged_proc_debug failed"); + + if (proc_debug < 1) + return Status( + "process debug disabled by security.bsd.unprivileged_proc_debug oid"); + + return {}; +} + // Public Static Methods llvm::Expected<std::unique_ptr<NativeProcessProtocol>> NativeProcessFreeBSD::Manager::Launch(ProcessLaunchInfo &launch_info, NativeDelegate &native_delegate) { Log *log = GetLog(POSIXLog::Process); - Status status; + ::pid_t pid = ProcessLauncherPosixFork() .LaunchProcess(launch_info, status) .GetProcessId(); LLDB_LOG(log, "pid = {0:x}", pid); if (status.Fail()) { + auto error = CanTrace(); LLDB_LOG(log, "failed to launch process: {0}", status); + if (error.Fail()) + return error.ToError(); return status.ToError(); } @@ -392,8 +410,11 @@ Status NativeProcessFreeBSD::PtraceWrapper(int req, lldb::pid_t pid, void *addr, ret = ptrace(req, static_cast<::pid_t>(pid), static_cast<caddr_t>(addr), data); - if (ret == -1) - error.SetErrorToErrno(); + if (ret == -1) { + error = CanTrace(); + if (error.Success()) + error.SetErrorToErrno(); + } if (result) *result = ret; @@ -707,8 +728,12 @@ Status NativeProcessFreeBSD::SetBreakpoint(lldb::addr_t addr, uint32_t size, 
Status NativeProcessFreeBSD::GetLoadedModuleFileSpec(const char *module_path, FileSpec &file_spec) { Status error = PopulateMemoryRegionCache(); - if (error.Fail()) + if (error.Fail()) { + auto status = CanTrace(); + if (status.Fail()) + return status; return error; + } FileSpec module_file_spec(module_path); FileSystem::Instance().Resolve(module_file_spec); @@ -729,8 +754,12 @@ NativeProcessFreeBSD::GetFileLoadAddress(const llvm::StringRef &file_name, lldb::addr_t &load_addr) { load_addr = LLDB_INVALID_ADDRESS; Status error = PopulateMemoryRegionCache(); - if (error.Fail()) + if (error.Fail()) { + auto status = CanTrace(); + if (status.Fail()) + return status; return error; + } FileSpec file(file_name); for (const auto &it : m_mem_region_cache) { -- cgit v1.1 From 6496948a427fc8f815f7c21cd068acd046873cca Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:28:37 +0300 Subject: [clang][NFC] Partially annotate `APINotes/Types.h` with `preferred_type` `RawRetainCountConversion` bit-field requires a new enumerator in `RetainCountConventionKind` to be annotated. --- clang/include/clang/APINotes/Types.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h index 1d116be..93bb045 100644 --- a/clang/include/clang/APINotes/Types.h +++ b/clang/include/clang/APINotes/Types.h @@ -55,16 +55,20 @@ public: std::string UnavailableMsg; /// Whether this entity is marked unavailable. + LLVM_PREFERRED_TYPE(bool) unsigned Unavailable : 1; /// Whether this entity is marked unavailable in Swift. + LLVM_PREFERRED_TYPE(bool) unsigned UnavailableInSwift : 1; private: /// Whether SwiftPrivate was specified. + LLVM_PREFERRED_TYPE(bool) unsigned SwiftPrivateSpecified : 1; /// Whether this entity is considered "private" to a Swift overlay. + LLVM_PREFERRED_TYPE(bool) unsigned SwiftPrivate : 1; public: @@ -191,18 +195,25 @@ inline bool operator!=(const CommonTypeInfo &LHS, const CommonTypeInfo &RHS) { /// Describes API notes data for an Objective-C class or protocol. class ObjCContextInfo : public CommonTypeInfo { /// Whether this class has a default nullability. + LLVM_PREFERRED_TYPE(bool) unsigned HasDefaultNullability : 1; /// The default nullability. + LLVM_PREFERRED_TYPE(NullabilityKind) unsigned DefaultNullability : 2; /// Whether this class has designated initializers recorded. + LLVM_PREFERRED_TYPE(bool) unsigned HasDesignatedInits : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsNonGenericSpecified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsNonGeneric : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftObjCMembersSpecified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftObjCMembers : 1; public: @@ -298,10 +309,12 @@ inline bool operator!=(const ObjCContextInfo &LHS, const ObjCContextInfo &RHS) { /// API notes for a variable/property. class VariableInfo : public CommonEntityInfo { /// Whether this property has been audited for nullability. + LLVM_PREFERRED_TYPE(bool) unsigned NullabilityAudited : 1; /// The kind of nullability for this property. Only valid if the nullability /// has been audited. + LLVM_PREFERRED_TYPE(NullabilityKind) unsigned Nullable : 2; /// The C type of the variable, as a string. @@ -352,7 +365,9 @@ inline bool operator!=(const VariableInfo &LHS, const VariableInfo &RHS) { /// Describes API notes data for an Objective-C property. 
class ObjCPropertyInfo : public VariableInfo { + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsAccessorsSpecified : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SwiftImportAsAccessors : 1; public: @@ -409,9 +424,11 @@ inline bool operator!=(const ObjCPropertyInfo &LHS, /// Describes a function or method parameter. class ParamInfo : public VariableInfo { /// Whether noescape was specified. + LLVM_PREFERRED_TYPE(bool) unsigned NoEscapeSpecified : 1; /// Whether the this parameter has the 'noescape' attribute. + LLVM_PREFERRED_TYPE(bool) unsigned NoEscape : 1; /// A biased RetainCountConventionKind, where 0 means "unspecified". @@ -488,6 +505,7 @@ public: // unknown nullability. /// Whether the signature has been audited with respect to nullability. + LLVM_PREFERRED_TYPE(bool) unsigned NullabilityAudited : 1; /// Number of types whose nullability is encoded with the NullabilityPayload. @@ -597,9 +615,11 @@ inline bool operator!=(const FunctionInfo &LHS, const FunctionInfo &RHS) { class ObjCMethodInfo : public FunctionInfo { public: /// Whether this is a designated initializer of its class. + LLVM_PREFERRED_TYPE(bool) unsigned DesignatedInit : 1; /// Whether this is a required initializer. + LLVM_PREFERRED_TYPE(bool) unsigned RequiredInit : 1; ObjCMethodInfo() : DesignatedInit(false), RequiredInit(false) {} @@ -650,7 +670,9 @@ public: /// Describes API notes data for a tag. class TagInfo : public CommonTypeInfo { + LLVM_PREFERRED_TYPE(bool) unsigned HasFlagEnum : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsFlagEnum : 1; public: -- cgit v1.1 From fcd21624b082b0c42777f6047cdfbc8a59057001 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:44:02 +0300 Subject: [clang][NFC] Annotate `Driver.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Driver/Driver.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 3ee1bcf..908bc87 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -232,10 +232,12 @@ public: bool IsDXCMode() const { return Mode == DXCMode; } /// Only print tool bindings, don't build any jobs. + LLVM_PREFERRED_TYPE(bool) unsigned CCCPrintBindings : 1; /// Set CC_PRINT_OPTIONS mode, which is like -v but logs the commands to /// CCPrintOptionsFilename or to stderr. + LLVM_PREFERRED_TYPE(bool) unsigned CCPrintOptions : 1; /// The format of the header information that is emitted. If CC_PRINT_HEADERS @@ -252,17 +254,21 @@ public: /// Set CC_LOG_DIAGNOSTICS mode, which causes the frontend to log diagnostics /// to CCLogDiagnosticsFilename or to stderr, in a stable machine readable /// format. + LLVM_PREFERRED_TYPE(bool) unsigned CCLogDiagnostics : 1; /// Whether the driver is generating diagnostics for debugging purposes. + LLVM_PREFERRED_TYPE(bool) unsigned CCGenDiagnostics : 1; /// Set CC_PRINT_PROC_STAT mode, which causes the driver to dump /// performance report to CC_PRINT_PROC_STAT_FILE or to stdout. + LLVM_PREFERRED_TYPE(bool) unsigned CCPrintProcessStats : 1; /// Set CC_PRINT_INTERNAL_STAT mode, which causes the driver to dump internal /// performance report to CC_PRINT_INTERNAL_STAT_FILE or to stdout. + LLVM_PREFERRED_TYPE(bool) unsigned CCPrintInternalStats : 1; /// Pointer to the ExecuteCC1Tool function, if available. @@ -303,9 +309,11 @@ private: /// Whether to check that input files exist when constructing compilation /// jobs. 
+ LLVM_PREFERRED_TYPE(bool) unsigned CheckInputsExist : 1; /// Whether to probe for PCH files on disk, in order to upgrade /// -include foo.h to -include-pch foo.h.pch. + LLVM_PREFERRED_TYPE(bool) unsigned ProbePrecompiled : 1; public: @@ -319,6 +327,7 @@ public: private: /// Certain options suppress the 'no input files' warning. + LLVM_PREFERRED_TYPE(bool) unsigned SuppressMissingInputWarning : 1; /// Cache of all the ToolChains in use by the driver. -- cgit v1.1 From 956722698172a806652ca8e2dba0a783a1c3d593 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:46:51 +0300 Subject: [clang][NFC] Annotate `CodeCompletionConsumer.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/CodeCompleteConsumer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/include/clang/Sema/CodeCompleteConsumer.h b/clang/include/clang/Sema/CodeCompleteConsumer.h index 274eaac..a2028e4 100644 --- a/clang/include/clang/Sema/CodeCompleteConsumer.h +++ b/clang/include/clang/Sema/CodeCompleteConsumer.h @@ -581,6 +581,7 @@ private: unsigned Priority : 16; /// The availability of this code-completion result. + LLVM_PREFERRED_TYPE(CXAvailabilityKind) unsigned Availability : 2; /// The name of the parent context. -- cgit v1.1 From 04812c72dee2c1c211306a4dd6d51e783f0c5015 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:49:20 +0300 Subject: [clang][NFC] Annotate `CodeCompletionOptions.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Sema/CodeCompleteOptions.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/clang/include/clang/Sema/CodeCompleteOptions.h b/clang/include/clang/Sema/CodeCompleteOptions.h index a3403b0..d8dc386 100644 --- a/clang/include/clang/Sema/CodeCompleteOptions.h +++ b/clang/include/clang/Sema/CodeCompleteOptions.h @@ -9,18 +9,23 @@ #ifndef LLVM_CLANG_SEMA_CODECOMPLETEOPTIONS_H #define LLVM_CLANG_SEMA_CODECOMPLETEOPTIONS_H +#include "llvm/Support/Compiler.h" + namespace clang { /// Options controlling the behavior of code completion. class CodeCompleteOptions { public: /// Show macros in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeMacros : 1; /// Show code patterns in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeCodePatterns : 1; /// Show top-level decls in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeGlobals : 1; /// Show decls in namespace (including the global namespace) in code @@ -29,18 +34,22 @@ public: /// Currently, this only works when completing qualified IDs (i.e. /// `Sema::CodeCompleteQualifiedId`). /// FIXME: consider supporting more completion cases with this option. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeNamespaceLevelDecls : 1; /// Show brief documentation comments in code completion results. + LLVM_PREFERRED_TYPE(bool) unsigned IncludeBriefComments : 1; /// Hint whether to load data from the external AST to provide full results. /// If false, namespace-level declarations and macros from the preamble may be /// omitted. + LLVM_PREFERRED_TYPE(bool) unsigned LoadExternal : 1; /// Include results after corrections (small fix-its), e.g. change '.' to '->' /// on member access, etc. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IncludeFixIts : 1; CodeCompleteOptions() -- cgit v1.1 From 1ee81076388078cb0cb1fbc90ad374fceafd0c98 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:52:10 +0300 Subject: [clang][NFC] Annotate `ASTReader.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Serialization/ASTReader.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index a4c7f54..2002bf2 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -721,6 +721,7 @@ private: unsigned ID; /// Whether this is a wildcard export. + LLVM_PREFERRED_TYPE(bool) unsigned IsWildcard : 1; /// String data. -- cgit v1.1 From 3bf89e5883ff0ea82ca4ad5cd511b77826b7bb71 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:55:07 +0300 Subject: [clang][NFC] Annotate `AST/Linkage.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/Linkage.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/lib/AST/Linkage.h b/clang/lib/AST/Linkage.h index 31f384e..e4dcb5e 100644 --- a/clang/lib/AST/Linkage.h +++ b/clang/lib/AST/Linkage.h @@ -29,12 +29,15 @@ namespace clang { struct LVComputationKind { /// The kind of entity whose visibility is ultimately being computed; /// visibility computations for types and non-types follow different rules. + LLVM_PREFERRED_TYPE(bool) unsigned ExplicitKind : 1; /// Whether explicit visibility attributes should be ignored. When set, /// visibility may only be restricted by the visibility of template arguments. + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreExplicitVisibility : 1; /// Whether all visibility should be ignored. When set, we're only interested /// in computing linkage. + LLVM_PREFERRED_TYPE(bool) unsigned IgnoreAllVisibility : 1; enum { NumLVComputationKindBits = 3 }; -- cgit v1.1 From 1366e4f594bdb4cd429423a1e07509e984838fa0 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 11:59:03 +0300 Subject: [clang][NFC] Annotate `Interp/Descriptor.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/Interp/Descriptor.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index 6cca9d5..6a53205 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -59,17 +59,22 @@ struct InlineDescriptor { /// Flag indicating if the storage is constant or not. /// Relevant for primitive fields. + LLVM_PREFERRED_TYPE(bool) unsigned IsConst : 1; /// For primitive fields, it indicates if the field was initialized. /// Primitive fields in static storage are always initialized. /// Arrays are always initialized, even though their elements might not be. /// Base classes are initialized after the constructor is invoked. + LLVM_PREFERRED_TYPE(bool) unsigned IsInitialized : 1; /// Flag indicating if the field is an embedded base class. + LLVM_PREFERRED_TYPE(bool) unsigned IsBase : 1; /// Flag indicating if the field is the active member of a union. + LLVM_PREFERRED_TYPE(bool) unsigned IsActive : 1; /// Flag indicating if the field is mutable (if in a record). 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsFieldMutable : 1; const Descriptor *Desc; -- cgit v1.1 From ee56d494974311049e055c73e4feb2e4098f1da8 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:02:26 +0300 Subject: [clang][NFC] Annotate `Targets/ARM.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/Basic/Targets/ARM.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h index 9802eb0..71322a0 100644 --- a/clang/lib/Basic/Targets/ARM.h +++ b/clang/lib/Basic/Targets/ARM.h @@ -61,26 +61,43 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo { llvm::ARM::ProfileKind ArchProfile; unsigned ArchVersion; + LLVM_PREFERRED_TYPE(FPUMode) unsigned FPU : 5; + LLVM_PREFERRED_TYPE(MVEMode) unsigned MVE : 2; + LLVM_PREFERRED_TYPE(bool) unsigned IsAAPCS : 1; + LLVM_PREFERRED_TYPE(HWDivMode) unsigned HWDiv : 2; // Initialized via features. + LLVM_PREFERRED_TYPE(bool) unsigned SoftFloat : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SoftFloatABI : 1; + LLVM_PREFERRED_TYPE(bool) unsigned CRC : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Crypto : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SHA2 : 1; + LLVM_PREFERRED_TYPE(bool) unsigned AES : 1; + LLVM_PREFERRED_TYPE(bool) unsigned DSP : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Unaligned : 1; + LLVM_PREFERRED_TYPE(bool) unsigned DotProd : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasMatMul : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FPRegsDisabled : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasPAC : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasBTI : 1; enum { -- cgit v1.1 From ba0d35181cef094209306207dc6e3fa816ddde36 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:04:55 +0300 Subject: [clang][NFC] Annotate `CGCall.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGCall.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h index 1c0d15d..1bd48a0 100644 --- a/clang/lib/CodeGen/CGCall.h +++ b/clang/lib/CodeGen/CGCall.h @@ -357,8 +357,11 @@ class ReturnValueSlot { Address Addr = Address::invalid(); // Return value slot flags + LLVM_PREFERRED_TYPE(bool) unsigned IsVolatile : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsUnused : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsExternallyDestructed : 1; public: -- cgit v1.1 From fd80304763a41f86b877c91b750551d7e6bd852d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:07:27 +0300 Subject: [clang][NFC] Annotate `CGCUDARuntime.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGCUDARuntime.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h index c7af8f1..8030d63 100644 --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -54,10 +54,15 @@ public: }; private: + LLVM_PREFERRED_TYPE(DeviceVarKind) unsigned Kind : 2; + LLVM_PREFERRED_TYPE(bool) unsigned Extern : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Constant : 1; // Constant variable. + LLVM_PREFERRED_TYPE(bool) unsigned Managed : 1; // Managed variable. + LLVM_PREFERRED_TYPE(bool) unsigned Normalized : 1; // Normalized texture. int SurfTexType; // Type of surface/texture. 
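A note on the annotation these commits keep adding: `LLVM_PREFERRED_TYPE` lives in `llvm/Support/Compiler.h` (one of the patches above adds that include for exactly this reason) and, on compilers that support it, expands to the `[[clang::preferred_type(T)]]` attribute so the debug info records which type a bit-field is meant to hold. Here is a minimal sketch of the effect, using a hypothetical `Color` enum rather than any type from these patches, with a stand-in macro definition assumed to match the real one:

```cpp
// Stand-in for the real macro from llvm/Support/Compiler.h (assumed definition).
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(preferred_type)
#define LLVM_PREFERRED_TYPE(T) [[clang::preferred_type(T)]]
#else
#define LLVM_PREFERRED_TYPE(T)
#endif

enum class Color : unsigned { Red, Green, Blue };

struct Pixel {
  // A debugger can now render the stored bits as Color::Green
  // instead of the bare unsigned value 1.
  LLVM_PREFERRED_TYPE(Color)
  unsigned Hue : 2;
  // Likewise, one-bit flags display as true/false instead of 1/0.
  LLVM_PREFERRED_TYPE(bool)
  unsigned Visible : 1;
};
```

Without the annotation the bit-field's declared type (`unsigned`) is all the debugger sees, which is why these NFC commits touch so many flag-heavy classes.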
-- cgit v1.1 From 35737beaef1452b6ecdb0e6d7a359d48c8e9236a Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:11:49 +0300 Subject: [clang][NFC] Annotate `CodeGenFunction.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CodeGenFunction.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 143ad64..fc9b328 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -203,6 +203,7 @@ template <> struct DominatingValue { llvm::Value *Value; llvm::Type *ElementType; + LLVM_PREFERRED_TYPE(Kind) unsigned K : 3; unsigned Align : 29; saved_type(llvm::Value *v, llvm::Type *e, Kind k, unsigned a = 0) @@ -650,9 +651,11 @@ public: struct LifetimeExtendedCleanupHeader { /// The size of the following cleanup object. unsigned Size; - /// The kind of cleanup to push: a value from the CleanupKind enumeration. + /// The kind of cleanup to push. + LLVM_PREFERRED_TYPE(CleanupKind) unsigned Kind : 31; /// Whether this is a conditional cleanup. + LLVM_PREFERRED_TYPE(bool) unsigned IsConditional : 1; size_t getSize() const { return Size; } -- cgit v1.1 From 866e073c2851bd4180cc0c64ce5a3d7f109e21dc Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:14:31 +0300 Subject: [clang][NFC] Annotate `CGRecordLayout.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGRecordLayout.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CodeGen/CGRecordLayout.h b/clang/lib/CodeGen/CGRecordLayout.h index d5ea749..6c06ad2 100644 --- a/clang/lib/CodeGen/CGRecordLayout.h +++ b/clang/lib/CodeGen/CGRecordLayout.h @@ -71,6 +71,7 @@ struct CGBitFieldInfo { unsigned Size : 15; /// Whether the bit-field is signed. + LLVM_PREFERRED_TYPE(bool) unsigned IsSigned : 1; /// The storage size in bits which should be used when accessing this -- cgit v1.1 From 1ed37606ca4bda4659b33a7f570d273b5afd16ea Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:20:34 +0300 Subject: [clang][NFC] Annotate `CGCleanup.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGCleanup.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h index fcfbf41..7a7344c 100644 --- a/clang/lib/CodeGen/CGCleanup.h +++ b/clang/lib/CodeGen/CGCleanup.h @@ -40,6 +40,10 @@ struct CatchTypeInfo { /// A protected scope for zero-cost EH handling. class EHScope { +public: + enum Kind { Cleanup, Catch, Terminate, Filter }; + +private: llvm::BasicBlock *CachedLandingPad; llvm::BasicBlock *CachedEHDispatchBlock; @@ -47,6 +51,7 @@ class EHScope { class CommonBitFields { friend class EHScope; + LLVM_PREFERRED_TYPE(Kind) unsigned Kind : 3; }; enum { NumCommonBits = 3 }; @@ -64,21 +69,27 @@ protected: unsigned : NumCommonBits; /// Whether this cleanup needs to be run along normal edges. + LLVM_PREFERRED_TYPE(bool) unsigned IsNormalCleanup : 1; /// Whether this cleanup needs to be run along exception edges. + LLVM_PREFERRED_TYPE(bool) unsigned IsEHCleanup : 1; /// Whether this cleanup is currently active. 
+ LLVM_PREFERRED_TYPE(bool) unsigned IsActive : 1; /// Whether this cleanup is a lifetime marker + LLVM_PREFERRED_TYPE(bool) unsigned IsLifetimeMarker : 1; /// Whether the normal cleanup should test the activation flag. + LLVM_PREFERRED_TYPE(bool) unsigned TestFlagInNormalCleanup : 1; /// Whether the EH cleanup should test the activation flag. + LLVM_PREFERRED_TYPE(bool) unsigned TestFlagInEHCleanup : 1; /// The amount of extra storage needed by the Cleanup. @@ -101,8 +112,6 @@ protected: }; public: - enum Kind { Cleanup, Catch, Terminate, Filter }; - EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope) : CachedLandingPad(nullptr), CachedEHDispatchBlock(nullptr), EnclosingEHScope(enclosingEHScope) { -- cgit v1.1 From bf571059f3bcf50bf8d3b39dc6aadeb14ede14bf Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 12:57:42 +0300 Subject: [clang][NFC] Annotate `LangOptions.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Basic/LangOptions.h | 48 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index c1cc554..862952d 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -30,27 +30,6 @@ namespace clang { -/// Bitfields of LangOptions, split out from LangOptions in order to ensure that -/// this large collection of bitfields is a trivial class type. -class LangOptionsBase { - friend class CompilerInvocation; - friend class CompilerInvocationBase; - -public: - // Define simple language options (with no accessors). -#define LANGOPT(Name, Bits, Default, Description) unsigned Name : Bits; -#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) -#include "clang/Basic/LangOptions.def" - -protected: - // Define language options of enumeration type. These are private, and will - // have accessors (below). -#define LANGOPT(Name, Bits, Default, Description) -#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ - unsigned Name : Bits; -#include "clang/Basic/LangOptions.def" -}; - /// In the Microsoft ABI, this controls the placement of virtual displacement /// members used to implement virtual inheritance. enum class MSVtorDispMode { Never, ForVBaseOverride, ForVFTable }; @@ -78,9 +57,12 @@ enum class ShaderStage { Invalid, }; -/// Keeps track of the various options that can be -/// enabled, which controls the dialect of C or C++ that is accepted. -class LangOptions : public LangOptionsBase { +/// Bitfields of LangOptions, split out from LangOptions in order to ensure that +/// this large collection of bitfields is a trivial class type. +class LangOptionsBase { + friend class CompilerInvocation; + friend class CompilerInvocationBase; + public: using Visibility = clang::Visibility; using RoundingMode = llvm::RoundingMode; @@ -416,6 +398,24 @@ public: enum ComplexRangeKind { CX_Full, CX_Limited, CX_Fortran, CX_None }; + // Define simple language options (with no accessors). +#define LANGOPT(Name, Bits, Default, Description) unsigned Name : Bits; +#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) +#include "clang/Basic/LangOptions.def" + +protected: + // Define language options of enumeration type. These are private, and will + // have accessors (below). 
+#define LANGOPT(Name, Bits, Default, Description) +#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ + LLVM_PREFERRED_TYPE(Type) \ + unsigned Name : Bits; +#include "clang/Basic/LangOptions.def" +}; + +/// Keeps track of the various options that can be +/// enabled, which controls the dialect of C or C++ that is accepted. +class LangOptions : public LangOptionsBase { public: /// The used language standard. LangStandard::Kind LangStd; -- cgit v1.1 From 4502dc416f40e0165ef988ded7db2673ac35670e Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Sun, 11 Feb 2024 10:04:29 +0000 Subject: [mlir][nfc] Remove leftover print stmt in a test --- mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir index 1739341..22cf15d 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir @@ -98,7 +98,6 @@ module attributes {transform.with_named_sequence} { // Step 1: Tile %tiled_matmul, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - transform.print %tiled_matmul {name = "matmul lal"}: !transform.any_op // Step 2: Vectorize transform.structured.vectorize %tiled_matmul vector_sizes [2, [4], 1] : !transform.any_op -- cgit v1.1 From 5aec9392674572fa5a06283173a6a739742d261d Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 11 Feb 2024 02:14:22 -0800 Subject: [clang-format][NFC] Keep Operator== sorted in Format.h --- clang/include/clang/Format/Format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index ab56cc8..d9c18e5 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4822,7 +4822,6 @@ struct FormatStyle { AlwaysBreakAfterReturnType == R.AlwaysBreakAfterReturnType && AlwaysBreakBeforeMultilineStrings == R.AlwaysBreakBeforeMultilineStrings && - BreakTemplateDeclarations == R.BreakTemplateDeclarations && AttributeMacros == R.AttributeMacros && BinPackArguments == R.BinPackArguments && BinPackParameters == R.BinPackParameters && @@ -4840,6 +4839,7 @@ struct FormatStyle { BreakConstructorInitializers == R.BreakConstructorInitializers && BreakInheritanceList == R.BreakInheritanceList && BreakStringLiterals == R.BreakStringLiterals && + BreakTemplateDeclarations == R.BreakTemplateDeclarations && ColumnLimit == R.ColumnLimit && CommentPragmas == R.CommentPragmas && CompactNamespaces == R.CompactNamespaces && ConstructorInitializerIndentWidth == -- cgit v1.1 From 5932fcc47855fdd209784f38820422d2369b84b2 Mon Sep 17 00:00:00 2001 From: Quentin Dian Date: Sun, 11 Feb 2024 18:24:59 +0800 Subject: [InlineCost] Consider the default branch when calculating cost (#77856) First step in fixing #76772. This PR considers the default branch as a case branch. This will give the unreachable default branch fair consideration. 
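In outline, the change boils down to the following (a condensed sketch of the diff below, not a verbatim excerpt; `addCost` and `InstrCost` are the cost model's existing helpers):

    // New SwitchInst query: the default edge can never actually be taken if
    // its destination's first real instruction is `unreachable`.
    bool defaultDestUndefined() const {
      return isa<UnreachableInst>(getDefaultDest()->getFirstNonPHIOrDbg());
    }

    // Cost finalization: only charge for the default branch when it is a
    // genuinely reachable destination.
    void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                          bool DefaultDestUndefined) {
      if (!DefaultDestUndefined)
        addCost(2 * InstrCost);
      // ... jump-table and case-cluster costs as before ...
    }
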
--- .../include/llvm/Analysis/InlineModelFeatureMaps.h | 2 + llvm/include/llvm/IR/Instructions.h | 7 + llvm/lib/Analysis/InlineCost.cpp | 21 +- .../Transforms/Inline/inline-switch-default-2.ll | 317 +++++++++++++++++++++ .../Transforms/Inline/inline-switch-default.ll | 216 ++++++++++++++ 5 files changed, 555 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Transforms/Inline/inline-switch-default-2.ll create mode 100644 llvm/test/Transforms/Inline/inline-switch-default.ll diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h index ca9bb724..d62ec9c 100644 --- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h +++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h @@ -39,6 +39,8 @@ namespace llvm { M(int64_t, {1}, jump_table_penalty, "Accumulation of costs for jump tables") \ M(int64_t, {1}, case_cluster_penalty, \ "Accumulation of costs for case clusters") \ + M(int64_t, {1}, switch_default_dest_penalty, \ + "Accumulation of costs for switch default destination") \ M(int64_t, {1}, switch_penalty, \ "Accumulation of costs for switch statements") \ M(int64_t, {1}, unsimplified_common_instructions, \ diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 4b5a442..1db4ff2 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -49,6 +49,7 @@ class DataLayout; class StringRef; class Type; class Value; +class UnreachableInst; //===----------------------------------------------------------------------===// // AllocaInst Class @@ -3505,6 +3506,12 @@ public: return cast<BasicBlock>(getOperand(1)); } + /// Returns true if the default branch must result in immediate undefined + /// behavior, false otherwise. + bool defaultDestUndefined() const { + return isa<UnreachableInst>(getDefaultDest()->getFirstNonPHIOrDbg()); + } + void setDefaultDest(BasicBlock *DefaultCase) { setOperand(1, reinterpret_cast<Value*>(DefaultCase)); } diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 5b780b5..e55eaa5 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -336,8 +336,8 @@ protected: /// Called at the end of processing a switch instruction, with the given /// number of case clusters. - virtual void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) {} + virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, + bool DefaultDestUndefined) {} /// Called to account for any other instruction not specifically accounted /// for. @@ -699,15 +699,16 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { CallPenalty)); } - void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) override { + void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, + bool DefaultDestUndefined) override { + if (!DefaultDestUndefined) + addCost(2 * InstrCost); // If suitable for a jump table, consider the cost for the table size and // branch to destination. // Maximum valid cost increased in this function. if (JumpTableSize) { int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost; - addCost(JTCost); return; } @@ -1153,6 +1154,7 @@ private: // heuristics in the ML inliner. 
static constexpr int JTCostMultiplier = 4; static constexpr int CaseClusterCostMultiplier = 2; + static constexpr int SwitchDefaultDestCostMultiplier = 2; static constexpr int SwitchCostMultiplier = 2; // FIXME: These are taken from the heuristic-based cost visitor: we should @@ -1231,8 +1233,11 @@ private: } } - void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) override { + void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, + bool DefaultDestUndefined) override { + if (!DefaultDestUndefined) + increment(InlineCostFeatureIndex::switch_default_dest_penalty, + SwitchDefaultDestCostMultiplier * InstrCost); if (JumpTableSize) { int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + @@ -2461,7 +2466,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); - onFinalizeSwitch(JumpTableSize, NumCaseCluster); + onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined()); return false; } diff --git a/llvm/test/Transforms/Inline/inline-switch-default-2.ll b/llvm/test/Transforms/Inline/inline-switch-default-2.ll new file mode 100644 index 0000000..8d3e24c --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-switch-default-2.ll @@ -0,0 +1,317 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt %s -S -passes=inline -inline-threshold=21 | FileCheck %s + +; Check for scenarios without TTI. + +define i64 @foo1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo1( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT:%.*]] +; LOOKUPTABLE: branch_2.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: branch_4.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: branch_6.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: default_branch.i: +; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] +; LOOKUPTABLE: bar1.exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @foo1( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0.i: +; SWITCH-NEXT: br label [[BAR1_EXIT:%.*]] +; SWITCH: branch_2.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: branch_4.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: branch_6.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: default_branch.i: +; SWITCH-NEXT: br label [[BAR1_EXIT]] +; SWITCH: bar1.exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @foo1( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) +; 
CHECK-NEXT: ret i64 [[B]] +; + %b = call i64 @bar1(i64 %a) + ret i64 %b +} + +define i64 @foo2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo2( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]] +; LOOKUPTABLE: branch_2.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_4.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_6.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: unreachabledefault.i: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: bar2.exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @foo2( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0.i: +; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]] +; SWITCH: branch_2.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_4.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_6.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: unreachabledefault.i: +; SWITCH-NEXT: unreachable +; SWITCH: bar2.exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @foo2( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; CHECK-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; CHECK-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; CHECK-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; CHECK-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; CHECK-NEXT: ] +; CHECK: branch_0.i: +; CHECK-NEXT: br label [[BAR2_EXIT:%.*]] +; CHECK: branch_2.i: +; CHECK-NEXT: br label [[BAR2_EXIT]] +; CHECK: branch_4.i: +; CHECK-NEXT: br label [[BAR2_EXIT]] +; CHECK: branch_6.i: +; CHECK-NEXT: br label [[BAR2_EXIT]] +; CHECK: unreachabledefault.i: +; CHECK-NEXT: unreachable +; CHECK: bar2.exit: +; CHECK-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; CHECK-NEXT: ret i64 [[B_I]] +; + %b = call i64 @bar2(i64 %a) + ret i64 %b +} + +define i64 @bar1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar1( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: default_branch: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; 
LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @bar1( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: default_branch: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @bar1( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ +; CHECK-NEXT: i64 0, label [[BRANCH_0:%.*]] +; CHECK-NEXT: i64 2, label [[BRANCH_2:%.*]] +; CHECK-NEXT: i64 4, label [[BRANCH_4:%.*]] +; CHECK-NEXT: i64 6, label [[BRANCH_6:%.*]] +; CHECK-NEXT: ] +; CHECK: branch_0: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: branch_2: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_4: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_6: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: default_branch: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; CHECK-NEXT: ret i64 [[B]] +; + switch i64 %a, label %default_branch [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +default_branch: + br label %exit + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ], [ 3, %default_branch ] + ret i64 %b +} + +define i64 @bar2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar2( +; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: unreachabledefault: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] +; +; SWITCH-LABEL: define i64 @bar2( +; SWITCH-SAME: i64 [[TMP0:%.*]]) { +; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; SWITCH-NEXT: 
] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: unreachabledefault: +; SWITCH-NEXT: unreachable +; SWITCH: exit: +; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; SWITCH-NEXT: ret i64 [[TMP2]] +; +; CHECK-LABEL: define i64 @bar2( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ +; CHECK-NEXT: i64 0, label [[BRANCH_0:%.*]] +; CHECK-NEXT: i64 2, label [[BRANCH_2:%.*]] +; CHECK-NEXT: i64 4, label [[BRANCH_4:%.*]] +; CHECK-NEXT: i64 6, label [[BRANCH_6:%.*]] +; CHECK-NEXT: ] +; CHECK: branch_0: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: branch_2: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_4: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: branch_6: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: unreachabledefault: +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; CHECK-NEXT: ret i64 [[B]] +; + switch i64 %a, label %unreachabledefault [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +unreachabledefault: + unreachable + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ] + ret i64 %b +} diff --git a/llvm/test/Transforms/Inline/inline-switch-default.ll b/llvm/test/Transforms/Inline/inline-switch-default.ll new file mode 100644 index 0000000..44f1304 --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-switch-default.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt %s -S -passes=inline -inline-threshold=26 -min-jump-table-entries=4 | FileCheck %s -check-prefix=LOOKUPTABLE +; RUN: opt %s -S -passes=inline -inline-threshold=21 -min-jump-table-entries=5 | FileCheck %s -check-prefix=SWITCH + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; The `bar1` should not be inlined since there is a default branch. 
+ +define i64 @foo1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo1( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) +; LOOKUPTABLE-NEXT: ret i64 [[B]] +; +; SWITCH-LABEL: define i64 @foo1( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) +; SWITCH-NEXT: ret i64 [[B]] +; + %b = call i64 @bar1(i64 %a) + ret i64 %b +} + +define i64 @foo2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @foo2( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]] +; LOOKUPTABLE: branch_2.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_4.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: branch_6.i: +; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] +; LOOKUPTABLE: unreachabledefault.i: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: bar2.exit: +; LOOKUPTABLE-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; LOOKUPTABLE-NEXT: ret i64 [[B_I]] +; +; SWITCH-LABEL: define i64 @foo2( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0.i: +; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]] +; SWITCH: branch_2.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_4.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: branch_6.i: +; SWITCH-NEXT: br label [[BAR2_EXIT]] +; SWITCH: unreachabledefault.i: +; SWITCH-NEXT: unreachable +; SWITCH: bar2.exit: +; SWITCH-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] +; SWITCH-NEXT: ret i64 [[B_I]] +; + %b = call i64 @bar2(i64 %a) + ret i64 %b +} + +define i64 @bar1(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar1( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: default_branch: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; LOOKUPTABLE-NEXT: ret i64 [[B]] +; +; SWITCH-LABEL: define i64 @bar1( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; 
SWITCH-NEXT: ] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: default_branch: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: exit: +; SWITCH-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] +; SWITCH-NEXT: ret i64 [[B]] +; + switch i64 %a, label %default_branch [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +default_branch: + br label %exit + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ], [ 3, %default_branch ] + ret i64 %b +} + +define i64 @bar2(i64 %a) { +; LOOKUPTABLE-LABEL: define i64 @bar2( +; LOOKUPTABLE-SAME: i64 [[A:%.*]]) { +; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ +; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] +; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] +; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] +; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] +; LOOKUPTABLE-NEXT: ] +; LOOKUPTABLE: branch_0: +; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] +; LOOKUPTABLE: branch_2: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_4: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: branch_6: +; LOOKUPTABLE-NEXT: br label [[EXIT]] +; LOOKUPTABLE: unreachabledefault: +; LOOKUPTABLE-NEXT: unreachable +; LOOKUPTABLE: exit: +; LOOKUPTABLE-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; LOOKUPTABLE-NEXT: ret i64 [[B]] +; +; SWITCH-LABEL: define i64 @bar2( +; SWITCH-SAME: i64 [[A:%.*]]) { +; SWITCH-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ +; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] +; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] +; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] +; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] +; SWITCH-NEXT: ] +; SWITCH: branch_0: +; SWITCH-NEXT: br label [[EXIT:%.*]] +; SWITCH: branch_2: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_4: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: branch_6: +; SWITCH-NEXT: br label [[EXIT]] +; SWITCH: unreachabledefault: +; SWITCH-NEXT: unreachable +; SWITCH: exit: +; SWITCH-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] +; SWITCH-NEXT: ret i64 [[B]] +; + switch i64 %a, label %unreachabledefault [ + i64 0, label %branch_0 + i64 2, label %branch_2 + i64 4, label %branch_4 + i64 6, label %branch_6 + ] + +branch_0: + br label %exit + +branch_2: + br label %exit + +branch_4: + br label %exit + +branch_6: + br label %exit + +unreachabledefault: + unreachable + +exit: + %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ] + ret i64 %b +} -- cgit v1.1 From c3dfbb6f49845edd4b953055f5fe14257fad6b58 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 11 Feb 2024 11:20:11 +0000 Subject: [AArch64][GlobalISel] Add commute_constant_to_rhs to post legalizer combiners (#81103) This helps the fp reductions, moving the constant operands to the RHS which in turn helps simplify away fadd -0.0 and fmul 1.0. 
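The combine itself is a simple canonicalization; roughly (a sketch of the idea only, not the combiner's actual MIR-matching code, and the `Operand` type here is invented for illustration):

    #include <utility>

    struct Operand { bool IsConstant; /* ... value fields ... */ };

    // For a commutative operation, move a constant LHS over to the RHS so
    // that later folds which only match "x op constant" (x + -0.0 ==> x,
    // x * 1.0 ==> x) get a chance to fire.
    void commuteConstantToRhs(Operand &LHS, Operand &RHS) {
      if (LHS.IsConstant && !RHS.IsConstant)
        std::swap(LHS, RHS);
    }
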
--- llvm/lib/Target/AArch64/AArch64Combine.td | 3 +- .../CodeGen/AArch64/GlobalISel/arm64-atomic.ll | 12 +- llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll | 214 +++++++-------------- llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll | 112 ++++------- 4 files changed, 105 insertions(+), 236 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1daa7d5..fdea974 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -288,5 +288,6 @@ def AArch64PostLegalizerCombiner constant_fold_binops, identity_combines, ptr_add_immed_chain, overlapping_and, split_store_zero_128, undef_combines, - select_to_minmax, or_to_bsp]> { + select_to_minmax, or_to_bsp, + commute_constant_to_rhs]> { } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index 0e9c126..458c2cb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -2146,8 +2146,7 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff -; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mvn w0, w1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr1_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -3202,8 +3201,7 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff -; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mvn w0, w1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr2_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -4255,8 +4253,7 @@ define i32 @atomicrmw_and_i32(ptr %ptr, i32 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff -; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1 +; CHECK-OUTLINE-O1-NEXT: mvn w0, w1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr4_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -5276,8 +5273,7 @@ define i64 @atomicrmw_and_i64(ptr %ptr, i64 %rhs) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: mov x2, x0 -; CHECK-OUTLINE-O1-NEXT: mov x8, #-1 ; =0xffffffffffffffff -; CHECK-OUTLINE-O1-NEXT: eor x0, x8, x1 +; CHECK-OUTLINE-O1-NEXT: mvn x0, x1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr8_rel ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll index 2023770..de95943 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll @@ -13,11 +13,7 @@ define float @add_HalfS(<2 x float> %bin.rdx) { ; ; CHECK-GI-LABEL: add_HalfS: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; 
CHECK-GI-NEXT: fadd s0, s1, s0 -; CHECK-GI-NEXT: fadd s0, s0, s2 +; CHECK-GI-NEXT: faddp s0, v0.2s ; CHECK-GI-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx) ret float %r @@ -82,15 +78,12 @@ define half @add_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: add_HalfH: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: adrp x8, .LCPI1_0 ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: fadd h1, h0, h1 ; CHECK-GI-FP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 -; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 ; CHECK-GI-FP16-NEXT: fadd h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx) @@ -202,22 +195,18 @@ define half @add_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: add_H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: adrp x8, .LCPI2_0 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h0 -; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h1, v0.h[2] +; CHECK-GI-FP16-NEXT: faddp h2, v0.2h +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fadd h1, h2, h1 +; CHECK-GI-FP16-NEXT: mov h2, v0.h[4] ; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[4] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[5] ; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[5] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[6] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[6] ; CHECK-GI-FP16-NEXT: mov h0, v0.h[7] -; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 ; CHECK-GI-FP16-NEXT: fadd h1, h1, h3 +; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 ; CHECK-GI-FP16-NEXT: fadd h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx) @@ -225,44 +214,23 @@ define half @add_H(<8 x half> %bin.rdx) { } define float @add_S(<4 x float> %bin.rdx) { -; CHECK-SD-LABEL: add_S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov s1, v0.s[2] -; CHECK-SD-NEXT: faddp s2, v0.2s -; CHECK-SD-NEXT: mov s0, v0.s[3] -; CHECK-SD-NEXT: fadd s1, s2, s1 -; CHECK-SD-NEXT: fadd s0, s1, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v0.s[2] -; CHECK-GI-NEXT: fadd s1, s1, s0 -; CHECK-GI-NEXT: mov s0, v0.s[3] -; CHECK-GI-NEXT: fadd s1, s1, s2 -; CHECK-GI-NEXT: fadd s1, s1, s3 -; CHECK-GI-NEXT: fadd s0, s1, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_S: +; CHECK: // %bb.0: +; CHECK-NEXT: mov s1, v0.s[2] +; CHECK-NEXT: faddp s2, v0.2s +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s1, s2, s1 +; CHECK-NEXT: fadd s0, s1, s0 +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx) ret float %r } define double @add_D(<2 x double> %bin.rdx) { -; CHECK-SD-LABEL: add_D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: faddp d0, v0.2d -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_D: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-GI-NEXT: mov d2, v0.d[1] -; 
CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: fadd d0, d1, d0 -; CHECK-GI-NEXT: fadd d0, d0, d2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_D: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx) ret double %r } @@ -464,23 +432,19 @@ define half @add_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: add_2H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: adrp x8, .LCPI5_0 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-FP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0] -; CHECK-GI-FP16-NEXT: fadd h2, h2, h0 -; CHECK-GI-FP16-NEXT: fadd h2, h2, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: faddp h3, v0.2h +; CHECK-GI-FP16-NEXT: mov h4, v0.h[3] +; CHECK-GI-FP16-NEXT: fadd h2, h3, h2 +; CHECK-GI-FP16-NEXT: mov h3, v0.h[4] ; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 -; CHECK-GI-FP16-NEXT: mov h4, v0.h[4] +; CHECK-GI-FP16-NEXT: mov h4, v0.h[5] ; CHECK-GI-FP16-NEXT: fadd h2, h2, h3 -; CHECK-GI-FP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 -; CHECK-GI-FP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[6] ; CHECK-GI-FP16-NEXT: mov h0, v0.h[7] +; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 ; CHECK-GI-FP16-NEXT: fadd h2, h2, h3 ; CHECK-GI-FP16-NEXT: mov h3, v1.h[2] -; CHECK-GI-FP16-NEXT: fadd h2, h2, h4 ; CHECK-GI-FP16-NEXT: fadd h0, h2, h0 ; CHECK-GI-FP16-NEXT: mov h2, v1.h[1] ; CHECK-GI-FP16-NEXT: fadd h0, h0, h1 @@ -502,95 +466,51 @@ define half @add_2H(<16 x half> %bin.rdx) { } define float @add_2S(<8 x float> %bin.rdx) { -; CHECK-SD-LABEL: add_2S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov s2, v0.s[2] -; CHECK-SD-NEXT: faddp s3, v0.2s -; CHECK-SD-NEXT: mov s0, v0.s[3] -; CHECK-SD-NEXT: fadd s2, s3, s2 -; CHECK-SD-NEXT: mov s3, v1.s[2] -; CHECK-SD-NEXT: fadd s0, s2, s0 -; CHECK-SD-NEXT: mov s2, v1.s[1] -; CHECK-SD-NEXT: fadd s0, s0, s1 -; CHECK-SD-NEXT: mov s1, v1.s[3] -; CHECK-SD-NEXT: fadd s0, s0, s2 -; CHECK-SD-NEXT: fadd s0, s0, s3 -; CHECK-SD-NEXT: fadd s0, s0, s1 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_2S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2s, #128, lsl #24 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov s4, v0.s[2] -; CHECK-GI-NEXT: fadd s2, s2, s0 -; CHECK-GI-NEXT: mov s0, v0.s[3] -; CHECK-GI-NEXT: fadd s2, s2, s3 -; CHECK-GI-NEXT: mov s3, v1.s[2] -; CHECK-GI-NEXT: fadd s2, s2, s4 -; CHECK-GI-NEXT: fadd s0, s2, s0 -; CHECK-GI-NEXT: mov s2, v1.s[1] -; CHECK-GI-NEXT: fadd s0, s0, s1 -; CHECK-GI-NEXT: mov s1, v1.s[3] -; CHECK-GI-NEXT: fadd s0, s0, s2 -; CHECK-GI-NEXT: fadd s0, s0, s3 -; CHECK-GI-NEXT: fadd s0, s0, s1 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_2S: +; CHECK: // %bb.0: +; CHECK-NEXT: mov s2, v0.s[2] +; CHECK-NEXT: faddp s3, v0.2s +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s2, s3, s2 +; CHECK-NEXT: mov s3, v1.s[2] +; CHECK-NEXT: fadd s0, s2, s0 +; CHECK-NEXT: mov s2, v1.s[1] +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v1.s[3] +; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: fadd s0, s0, s3 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx) ret float %r } define double @add_2D(<4 x double> %bin.rdx) { -; CHECK-SD-LABEL: add_2D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: faddp d0, v0.2d -; CHECK-SD-NEXT: mov d2, v1.d[1] -; CHECK-SD-NEXT: fadd d0, d0, d1 -; CHECK-SD-NEXT: fadd d0, d0, d2 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_2D: -; 
CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-GI-NEXT: mov d3, v0.d[1] -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: fadd d0, d2, d0 -; CHECK-GI-NEXT: mov d2, v1.d[1] -; CHECK-GI-NEXT: fadd d0, d0, d3 -; CHECK-GI-NEXT: fadd d0, d0, d1 -; CHECK-GI-NEXT: fadd d0, d0, d2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_2D: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: mov d2, v1.d[1] +; CHECK-NEXT: fadd d0, d0, d1 +; CHECK-NEXT: fadd d0, d0, d2 +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx) ret double %r } ; Added at least one test where the start value is not -0.0. define float @add_S_init_42(<4 x float> %bin.rdx) { -; CHECK-SD-LABEL: add_S_init_42: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #1109917696 // =0x42280000 -; CHECK-SD-NEXT: mov s2, v0.s[1] -; CHECK-SD-NEXT: mov s3, v0.s[2] -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: fadd s1, s0, s1 -; CHECK-SD-NEXT: mov s0, v0.s[3] -; CHECK-SD-NEXT: fadd s1, s1, s2 -; CHECK-SD-NEXT: fadd s1, s1, s3 -; CHECK-SD-NEXT: fadd s0, s1, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_S_init_42: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v0.s[2] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fadd s1, s1, s0 -; CHECK-GI-NEXT: mov s0, v0.s[3] -; CHECK-GI-NEXT: fadd s1, s1, s2 -; CHECK-GI-NEXT: fadd s1, s1, s3 -; CHECK-GI-NEXT: fadd s0, s1, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_S_init_42: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-NEXT: mov s2, v0.s[1] +; CHECK-NEXT: mov s3, v0.s[2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fadd s1, s0, s1 +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s1, s1, s2 +; CHECK-NEXT: fadd s1, s1, s3 +; CHECK-NEXT: fadd s0, s1, s0 +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx) ret float %r } @@ -604,5 +524,3 @@ declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll index 32ce4d6..7b93e60 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll @@ -13,11 +13,9 @@ define float @mul_HalfS(<2 x float> %bin.rdx) { ; ; CHECK-GI-LABEL: mul_HalfS: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s1, #1.00000000 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: fmul s0, s1, s0 -; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx) ret float %r @@ -80,14 +78,12 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: mul_HalfH: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fmov h1, #1.00000000 ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-FP16-NEXT: fmul h1, h1, h0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: fmul h1, h0, h1 ; CHECK-GI-FP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-FP16-NEXT: fmul h1, h1, h2 -; CHECK-GI-FP16-NEXT: fmul h1, h1, h3 ; CHECK-GI-FP16-NEXT: fmul h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx) @@ -193,9 +189,7 @@ define half @mul_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: mul_H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fmov h1, #1.00000000 -; CHECK-GI-FP16-NEXT: fmul h1, h1, h0 -; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[1] +; CHECK-GI-FP16-NEXT: fmul h1, h0, v0.h[1] ; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[3] ; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[4] @@ -208,37 +202,21 @@ define half @mul_H(<8 x half> %bin.rdx) { } define float @mul_S(<4 x float> %bin.rdx) { -; CHECK-SD-LABEL: mul_S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul s1, s0, v0.s[1] -; CHECK-SD-NEXT: fmul s1, s1, v0.s[2] -; CHECK-SD-NEXT: fmul s0, s1, v0.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s1, #1.00000000 -; CHECK-GI-NEXT: fmul s1, s1, s0 -; CHECK-GI-NEXT: fmul s1, s1, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v0.s[2] -; CHECK-GI-NEXT: fmul s0, s1, v0.s[3] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_S: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul s1, s0, v0.s[1] +; CHECK-NEXT: fmul s1, s1, v0.s[2] +; CHECK-NEXT: fmul s0, s1, v0.s[3] +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx) ret float %r } define double @mul_D(<2 x double> %bin.rdx) { -; CHECK-SD-LABEL: mul_D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul d0, d0, v0.d[1] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_D: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov d1, #1.00000000 -; CHECK-GI-NEXT: fmul d1, d1, d0 -; CHECK-GI-NEXT: fmul d0, d1, v0.d[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_D: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, v0.d[1] +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx) ret double %r } @@ -427,9 +405,7 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: mul_2H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fmov h2, #1.00000000 -; CHECK-GI-FP16-NEXT: fmul h2, h2, h0 -; 
CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[1] +; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1] ; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[2] ; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[3] ; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[4] @@ -450,49 +426,27 @@ define half @mul_2H(<16 x half> %bin.rdx) { } define float @mul_2S(<8 x float> %bin.rdx) { -; CHECK-SD-LABEL: mul_2S: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul s2, s0, v0.s[1] -; CHECK-SD-NEXT: fmul s2, s2, v0.s[2] -; CHECK-SD-NEXT: fmul s0, s2, v0.s[3] -; CHECK-SD-NEXT: fmul s0, s0, s1 -; CHECK-SD-NEXT: fmul s0, s0, v1.s[1] -; CHECK-SD-NEXT: fmul s0, s0, v1.s[2] -; CHECK-SD-NEXT: fmul s0, s0, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_2S: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s2, #1.00000000 -; CHECK-GI-NEXT: fmul s2, s2, s0 -; CHECK-GI-NEXT: fmul s2, s2, v0.s[1] -; CHECK-GI-NEXT: fmul s2, s2, v0.s[2] -; CHECK-GI-NEXT: fmul s0, s2, v0.s[3] -; CHECK-GI-NEXT: fmul s0, s0, s1 -; CHECK-GI-NEXT: fmul s0, s0, v1.s[1] -; CHECK-GI-NEXT: fmul s0, s0, v1.s[2] -; CHECK-GI-NEXT: fmul s0, s0, v1.s[3] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_2S: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul s2, s0, v0.s[1] +; CHECK-NEXT: fmul s2, s2, v0.s[2] +; CHECK-NEXT: fmul s0, s2, v0.s[3] +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: fmul s0, s0, v1.s[1] +; CHECK-NEXT: fmul s0, s0, v1.s[2] +; CHECK-NEXT: fmul s0, s0, v1.s[3] +; CHECK-NEXT: ret %r = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx) ret float %r } define double @mul_2D(<4 x double> %bin.rdx) { -; CHECK-SD-LABEL: mul_2D: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmul d0, d0, v0.d[1] -; CHECK-SD-NEXT: fmul d0, d0, d1 -; CHECK-SD-NEXT: fmul d0, d0, v1.d[1] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mul_2D: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov d2, #1.00000000 -; CHECK-GI-NEXT: fmul d2, d2, d0 -; CHECK-GI-NEXT: fmul d0, d2, v0.d[1] -; CHECK-GI-NEXT: fmul d0, d0, d1 -; CHECK-GI-NEXT: fmul d0, d0, v1.d[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mul_2D: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, v0.d[1] +; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: fmul d0, d0, v1.d[1] +; CHECK-NEXT: ret %r = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx) ret double %r } -- cgit v1.1 From 887ed6d2876156ade8a382e521130feae4b91b82 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 11 Feb 2024 11:20:53 +0000 Subject: [AArch64][GlobalISel] Remove mulh c++ lowering (#81105) I believe these should be selectable via tablegen patterns nowadays. --- .../AArch64/GISel/AArch64InstructionSelector.cpp | 28 ---------------------- 1 file changed, 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 2515991..9d51a7f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -3020,34 +3020,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_INDEXED_STORE: return selectIndexedStore(cast<GIndexedStore>(I), MRI); - case TargetOpcode::G_SMULH: - case TargetOpcode::G_UMULH: { - // Reject the various things we don't support yet. 
- if (unsupportedBinOp(I, RBI, MRI, TRI)) - return false; - - const Register DefReg = I.getOperand(0).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - if (RB.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); - return false; - } - - if (Ty != LLT::scalar(64)) { - LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty - << ", expected: " << LLT::scalar(64) << '\n'); - return false; - } - - unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr - : AArch64::UMULHrr; - I.setDesc(TII.get(NewOpc)); - - // Now that we selected an opcode, we need to constrain the register - // operands to use appropriate classes. - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: if (MRI.getType(I.getOperand(0).getReg()).isVector()) -- cgit v1.1 From b985d4179a882892ce009fb3668cdc917e27f5d5 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 14:59:33 +0300 Subject: [clang][NFC] Annotate `ExprConstant.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/ExprConstant.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 02e153f..33ad94e 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -240,15 +240,19 @@ namespace { /// True if the subobject was named in a manner not supported by C++11. Such /// lvalues can still be folded, but they are not core constant expressions /// and we cannot perform lvalue-to-rvalue conversions on them. + LLVM_PREFERRED_TYPE(bool) unsigned Invalid : 1; /// Is this a pointer one past the end of an object? + LLVM_PREFERRED_TYPE(bool) unsigned IsOnePastTheEnd : 1; /// Indicator of whether the first entry is an unsized array. + LLVM_PREFERRED_TYPE(bool) unsigned FirstEntryIsAnUnsizedArray : 1; /// Indicator of whether the most-derived object is an array element. + LLVM_PREFERRED_TYPE(bool) unsigned MostDerivedIsArrayElement : 1; /// The length of the path to the most-derived object of which this is a -- cgit v1.1 From 63b414e4977d6e19f05947c88f57cd127fa328e3 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:01:18 +0300 Subject: [clang][NFC] Annotate `RecordLayoutBuilder.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/AST/RecordLayoutBuilder.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 6dfaadd..a3b7431 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -602,21 +602,28 @@ protected: /// Whether the external AST source has provided a layout for this /// record. + LLVM_PREFERRED_TYPE(bool) unsigned UseExternalLayout : 1; /// Whether we need to infer alignment, even when we have an /// externally-provided layout. + LLVM_PREFERRED_TYPE(bool) unsigned InferAlignment : 1; /// Packed - Whether the record is packed or not. 
+ LLVM_PREFERRED_TYPE(bool) unsigned Packed : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsUnion : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsMac68kAlign : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsNaturalAlign : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsMsStruct : 1; /// UnfilledBitsInLastUnit - If the last field laid out was a bitfield, -- cgit v1.1 From eaff01f4fc1b3f1ccdc5fc6dafb39af959d00f6d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:03:03 +0300 Subject: [clang][NFC] Annotate `CGExprCXX.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGExprCXX.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index d136bfc..2adbef6 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1423,6 +1423,7 @@ namespace { }; unsigned NumPlacementArgs : 31; + LLVM_PREFERRED_TYPE(bool) unsigned PassAlignmentToPlacementDelete : 1; const FunctionDecl *OperatorDelete; ValueTy Ptr; -- cgit v1.1 From bcc4c8231fbee46f1b16f8b9db7d9926745db9bb Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:04:28 +0300 Subject: [clang][NFC] Annotate `CGObjC.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/CodeGen/CGObjC.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp index 03fc0ec..f3a948c 100644 --- a/clang/lib/CodeGen/CGObjC.cpp +++ b/clang/lib/CodeGen/CGObjC.cpp @@ -899,9 +899,13 @@ namespace { const ObjCPropertyImplDecl *propImpl); private: + LLVM_PREFERRED_TYPE(StrategyKind) unsigned Kind : 8; + LLVM_PREFERRED_TYPE(bool) unsigned IsAtomic : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IsCopy : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasStrong : 1; CharUnits IvarSize; -- cgit v1.1 From 6884657de8da3024b50d8737219c1f24ab075c4c Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:06:15 +0300 Subject: [clang][NFC] Annotate `SemaChecking.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/Sema/SemaChecking.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index f8b73c7..71e6e72 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -16652,6 +16652,7 @@ class SequenceChecker : public ConstEvaluatedExprVisitor<SequenceChecker> { struct Value { explicit Value(unsigned Parent) : Parent(Parent), Merged(false) {} unsigned Parent : 31; + LLVM_PREFERRED_TYPE(bool) unsigned Merged : 1; }; SmallVector<Value, 8> Values; -- cgit v1.1 From f0b2bcfe91e70816b33973bc50a2cb63144ba77a Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:07:14 +0300 Subject: [clang][NFC] Annotate `SemaStmt.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. 
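For reference, the annotation pattern used throughout this series looks like the following (an illustrative sketch rather than an excerpt from any one file; the struct and enum here are made up). `LLVM_PREFERRED_TYPE(T)` is defined in llvm/Support/Compiler.h and, under a compiler that supports it, expands to `[[clang::preferred_type(T)]]`, telling the debugger to present the raw bit-field through `T`:

    enum class Tone { Light, Medium, Dark }; // example enum, not from the patches

    struct Flags {
      LLVM_PREFERRED_TYPE(bool)
      unsigned Enabled : 1; // displayed as true/false rather than 0/1
      LLVM_PREFERRED_TYPE(Tone)
      unsigned Shade : 2;   // displayed as Tone::Dark rather than 2
    };
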
--- clang/lib/Sema/SemaStmt.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 5ab2534..d9aaea8 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -4381,6 +4381,7 @@ Sema::ActOnObjCAutoreleasePoolStmt(SourceLocation AtLoc, Stmt *Body) { namespace { class CatchHandlerType { QualType QT; + LLVM_PREFERRED_TYPE(bool) unsigned IsPointer : 1; // This is a special constructor to be used only with DenseMapInfo's -- cgit v1.1 From 83269a04def26fe9890036857d3e1a8c6c1f770d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:08:58 +0300 Subject: [clang][NFC] Annotate `cc1as_main.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/tools/driver/cc1as_main.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index bc398fa..a55e0650 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -89,10 +89,15 @@ struct AssemblerInvocation { /// @{ std::vector<std::string> IncludePaths; + LLVM_PREFERRED_TYPE(bool) unsigned NoInitialTextSection : 1; + LLVM_PREFERRED_TYPE(bool) unsigned SaveTemporaryLabels : 1; + LLVM_PREFERRED_TYPE(bool) unsigned GenDwarfForAssembly : 1; + LLVM_PREFERRED_TYPE(bool) unsigned RelaxELFRelocations : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Dwarf64 : 1; unsigned DwarfVersion; std::string DwarfDebugFlags; @@ -117,7 +122,9 @@ struct AssemblerInvocation { FT_Obj ///< Object file output. }; FileType OutputType; + LLVM_PREFERRED_TYPE(bool) unsigned ShowHelp : 1; + LLVM_PREFERRED_TYPE(bool) unsigned ShowVersion : 1; /// @} /// @{ unsigned OutputAsmVariant; + LLVM_PREFERRED_TYPE(bool) unsigned ShowEncoding : 1; + LLVM_PREFERRED_TYPE(bool) unsigned ShowInst : 1; /// @} /// @name Assembler Options /// @{ + LLVM_PREFERRED_TYPE(bool) unsigned RelaxAll : 1; + LLVM_PREFERRED_TYPE(bool) unsigned NoExecStack : 1; + LLVM_PREFERRED_TYPE(bool) unsigned FatalWarnings : 1; + LLVM_PREFERRED_TYPE(bool) unsigned NoWarn : 1; + LLVM_PREFERRED_TYPE(bool) unsigned NoTypeCheck : 1; + LLVM_PREFERRED_TYPE(bool) unsigned IncrementalLinkerCompatible : 1; + LLVM_PREFERRED_TYPE(bool) unsigned EmbedBitcode : 1; /// Whether to emit DWARF unwind info. @@ -145,6 +161,7 @@ struct AssemblerInvocation { // Whether to emit compact-unwind for non-canonical entries. // Note: maybe overriden by other constraints. + LLVM_PREFERRED_TYPE(bool) unsigned EmitCompactUnwindNonCanonical : 1; /// The name of the relocation model to use. -- cgit v1.1 From bc1d61cbf8759f5144217af50d2309b5dddd5538 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:27:21 +0300 Subject: [clang][NFC] Annotate `SourceManagerTest.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. 
--- clang/unittests/Basic/SourceManagerTest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/unittests/Basic/SourceManagerTest.cpp b/clang/unittests/Basic/SourceManagerTest.cpp index 5572814..45840f5 100644 --- a/clang/unittests/Basic/SourceManagerTest.cpp +++ b/clang/unittests/Basic/SourceManagerTest.cpp @@ -530,6 +530,7 @@ struct MacroAction { SourceLocation Loc; std::string Name; + LLVM_PREFERRED_TYPE(Kind) unsigned MAKind : 3; MacroAction(SourceLocation Loc, StringRef Name, unsigned K) -- cgit v1.1 From 23bdca2c6737f25f1d184f03021f61157bac6196 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:41:49 +0300 Subject: [clang][NFC] Annotate `RISCVVEmitter.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/utils/TableGen/RISCVVEmitter.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index 9f6ed39..8513174 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -67,7 +67,9 @@ struct SemaRecord { bool HasMaskPolicy : 1; bool HasFRMRoundModeOp : 1; bool IsTuple : 1; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t UnMaskedPolicyScheme : 2; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t MaskedPolicyScheme : 2; }; -- cgit v1.1 From 4bbae068d704752acbd7c5d8652c11b0954742be Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:43:35 +0300 Subject: [clang][NFC] Annotate `RISCVVIntrinsicUtils.h` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/include/clang/Support/RISCVVIntrinsicUtils.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/include/clang/Support/RISCVVIntrinsicUtils.h b/clang/include/clang/Support/RISCVVIntrinsicUtils.h index 30bf36e..ef9d6c1 100644 --- a/clang/include/clang/Support/RISCVVIntrinsicUtils.h +++ b/clang/include/clang/Support/RISCVVIntrinsicUtils.h @@ -554,7 +554,9 @@ struct RVVIntrinsicRecord { bool HasMaskPolicy : 1; bool HasFRMRoundModeOp : 1; bool IsTuple : 1; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t UnMaskedPolicyScheme : 2; + LLVM_PREFERRED_TYPE(PolicyScheme) uint8_t MaskedPolicyScheme : 2; }; -- cgit v1.1 From 803374994602910aae2cb483d03bcbdb294b21bb Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 15:48:52 +0300 Subject: [clang][NFC] Annotate `DiagnosticID.cpp` with `preferred_type` This helps debuggers to display values in bit-fields in a more helpful way. --- clang/lib/Basic/DiagnosticIDs.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index 6c7bd50..b353a66 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -100,7 +100,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = { }; // Diagnostic classes. 
-enum { +enum DiagnosticClass { CLASS_NOTE = 0x01, CLASS_REMARK = 0x02, CLASS_WARNING = 0x03, @@ -110,15 +110,22 @@ enum { struct StaticDiagInfoRec { uint16_t DiagID; + LLVM_PREFERRED_TYPE(diag::Severity) uint8_t DefaultSeverity : 3; + LLVM_PREFERRED_TYPE(DiagnosticClass) uint8_t Class : 3; + LLVM_PREFERRED_TYPE(DiagnosticIDs::SFINAEResponse) uint8_t SFINAE : 2; uint8_t Category : 6; + LLVM_PREFERRED_TYPE(bool) uint8_t WarnNoWerror : 1; + LLVM_PREFERRED_TYPE(bool) uint8_t WarnShowInSystemHeader : 1; + LLVM_PREFERRED_TYPE(bool) uint8_t WarnShowInSystemMacro : 1; uint16_t OptionGroupIndex : 15; + LLVM_PREFERRED_TYPE(bool) uint16_t Deferrable : 1; uint16_t DescriptionLen; -- cgit v1.1 From fe0d277f31d3369de1fd92ad8dd8044f5b1d4ed7 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 11 Feb 2024 13:53:59 +0100 Subject: [libc++][ratio] Avoids accepting unrelated types. (#80491) The arithmetic and comparison operators are ill-formed when R1 or R2 is not a std::ratio. Fixes: https://github.com/llvm/llvm-project/issues/63753 --- libcxx/include/ratio | 42 +++++++++-- .../ratio.arithmetic/R1_R2_requirement.verify.cpp | 56 +++++++++++++++ .../ratio.comparison/R1_R2_requirement.verify.cpp | 81 ++++++++++++++++++++++ .../R1_R2_requirement_v.verify.cpp | 69 ++++++++++++++++++ 4 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp create mode 100644 libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp create mode 100644 libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp diff --git a/libcxx/include/ratio b/libcxx/include/ratio index 3b11a2a..de656f3 100644 --- a/libcxx/include/ratio +++ b/libcxx/include/ratio @@ -289,6 +289,9 @@ private: static const intmax_t __gcd_n1_d2 = __static_gcd<_R1::num, _R2::den>::value; static const intmax_t __gcd_d1_n2 = __static_gcd<_R1::den, _R2::num>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio< __ll_mul<_R1::num / __gcd_n1_d2, _R2::num / __gcd_d1_n2>::value, __ll_mul<_R2::den / __gcd_n1_d2, _R1::den / __gcd_d1_n2>::value >::type type; @@ -312,6 +315,9 @@ private: static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value; static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio< __ll_mul<_R1::num / __gcd_n1_n2, _R2::den / __gcd_d1_d2>::value, __ll_mul<_R2::num / __gcd_n1_n2, _R1::den / __gcd_d1_d2>::value >::type type; @@ -335,6 +341,9 @@ private: static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value; static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio_multiply< ratio<__gcd_n1_n2, _R1::den / __gcd_d1_d2>, @@ -361,6 +370,9 @@ private: static const intmax_t __gcd_n1_n2 = 
__static_gcd<_R1::num, _R2::num>::value; static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value; + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); + public: typedef typename ratio_multiply< ratio<__gcd_n1_n2, _R1::den / __gcd_d1_d2>, @@ -384,10 +396,16 @@ struct _LIBCPP_TEMPLATE_VIS ratio_subtract : public __ratio_subtract<_R1, _R2>:: // ratio_equal template -struct _LIBCPP_TEMPLATE_VIS ratio_equal : _BoolConstant<(_R1::num == _R2::num && _R1::den == _R2::den)> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_equal : _BoolConstant<(_R1::num == _R2::num && _R1::den == _R2::den)> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_not_equal : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_not_equal : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; // ratio_less @@ -441,16 +459,28 @@ struct __ratio_less<_R1, _R2, -1LL, -1LL> { }; template -struct _LIBCPP_TEMPLATE_VIS ratio_less : _BoolConstant<__ratio_less<_R1, _R2>::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_less : _BoolConstant<__ratio_less<_R1, _R2>::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_less_equal : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_less_equal : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_greater : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_greater : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template -struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant::value> {}; +struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant::value> { + static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template"); + static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template"); +}; template struct __ratio_gcd { diff --git a/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp new file mode 100644 index 0000000..9fc91e1 --- /dev/null +++ b/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp @@ -0,0 +1,56 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// +// +// [ratio.general]/2 +// Throughout subclause [ratio], the names of template parameters are +// used to express type requirements. If a template parameter is named +// R1 or R2, and the template argument is not a specialization of the +// ratio template, the program is ill-formed. + +#include + +struct invalid { + static const int num = 1; + static const int den = 1; +}; + +using valid = std::ratio<1, 1>; + +namespace add { +using valid_valid = std::ratio_add::type; +using invalid_valid = + std::ratio_add::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_add::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace add + +namespace subtract { +using valid_valid = std::ratio_subtract::type; +using invalid_valid = + std::ratio_subtract::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_subtract::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace subtract + +namespace multiply { +using valid_valid = std::ratio_multiply::type; +using invalid_valid = + std::ratio_multiply::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_multiply::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace multiply + +namespace divide { +using valid_valid = std::ratio_divide::type; +using invalid_valid = + std::ratio_divide::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_divide::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace divide diff --git a/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp new file mode 100644 index 0000000..03bb266 --- /dev/null +++ b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// +// +// [ratio.general]/2 +// Throughout subclause [ratio], the names of template parameters are +// used to express type requirements. If a template parameter is named +// R1 or R2, and the template argument is not a specialization of the +// ratio template, the program is ill-formed. +// +// Since all std::ratio_xxx_v variables use the same instantiation, only one +// error will be generated. These values are tested in a separate test. 
+ +#include + +struct invalid { + static const int num = 1; + static const int den = 1; +}; + +using valid = std::ratio<1, 1>; + +namespace equal { +using valid_valid = std::ratio_equal::type; +using invalid_valid = + std::ratio_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace equal + +namespace not_equal { +using valid_valid = std::ratio_not_equal::type; +using invalid_valid = + std::ratio_not_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_not_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace not_equal + +namespace less { +using valid_valid = std::ratio_less::type; +using invalid_valid = + std::ratio_less::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_less::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace less + +namespace less_equal { +using valid_valid = std::ratio_less_equal::type; +using invalid_valid = + std::ratio_less_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_less_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace less_equal + +namespace greater { +using valid_valid = std::ratio_greater::type; +using invalid_valid = + std::ratio_greater::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_greater::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace greater + +namespace greater_equal { +using valid_valid = std::ratio_greater_equal::type; +using invalid_valid = + std::ratio_greater_equal::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} +using valid_invalid = + std::ratio_greater_equal::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} // namespace greater_equal diff --git a/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp new file mode 100644 index 0000000..fbcf358 --- /dev/null +++ b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// +// +// [ratio.general]/2 +// Throughout subclause [ratio], the names of template parameters are +// used to express type requirements. If a template parameter is named +// R1 or R2, and the template argument is not a specialization of the +// ratio template, the program is ill-formed. +// +// Since all std::ratio_xxx_v variables use the same instantiation, only one +// error will be generated. These values are tested in a separate test. 
+ +#include + +struct invalid { + constexpr static int num = 1; + constexpr static int den = 1; +}; + +using valid = std::ratio<1, 1>; + +void test() { + // equal + (void)std::ratio_equal_v; + (void)std::ratio_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // not_equal + (void)std::ratio_not_equal_v; + (void)std::ratio_not_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_not_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // less + (void)std::ratio_less_v; + (void)std::ratio_less_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_less_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // less_equal + (void)std::ratio_less_equal_v; + (void)std::ratio_less_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_less_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // greater + (void)std::ratio_greater_v; + (void)std::ratio_greater_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + (void)std::ratio_greater_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} + + // greater_equal + (void)std::ratio_greater_equal_v; + + (void)std::ratio_greater_equal_v; // expected-error@*:* {{R1 to be a specialisation of the ratio template}} + + (void)std::ratio_greater_equal_v; // expected-error@*:* {{R2 to be a specialisation of the ratio template}} +} -- cgit v1.1 From 1503db86d65ee2bcc8ec1c2a5a4d00dea02aae0d Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 16:29:17 +0300 Subject: [clang][NFC] Refactor bit-fields in `RawComment` Make them all of the same `unsigned` type, which brings `sizeof(RawComment)` down from 12 to 4 when compiling Clang for Microsoft ABI. --- clang/include/clang/AST/RawCommentList.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/AST/RawCommentList.h b/clang/include/clang/AST/RawCommentList.h index 53aae24..3e4567b 100644 --- a/clang/include/clang/AST/RawCommentList.h +++ b/clang/include/clang/AST/RawCommentList.h @@ -175,17 +175,22 @@ private: mutable StringRef RawText; mutable const char *BriefText = nullptr; - mutable bool RawTextValid : 1; ///< True if RawText is valid - mutable bool BriefTextValid : 1; ///< True if BriefText is valid + LLVM_PREFERRED_TYPE(bool) + mutable unsigned RawTextValid : 1; + LLVM_PREFERRED_TYPE(bool) + mutable unsigned BriefTextValid : 1; LLVM_PREFERRED_TYPE(CommentKind) unsigned Kind : 3; /// True if comment is attached to a declaration in ASTContext. - bool IsAttached : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsAttached : 1; - bool IsTrailingComment : 1; - bool IsAlmostTrailingComment : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsTrailingComment : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsAlmostTrailingComment : 1; /// Constructor for AST deserialization. 
RawComment(SourceRange SR, CommentKind K, bool IsTrailingComment, -- cgit v1.1 From 082439c33fa76ad4df267600472695d24ad53821 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 16:40:15 +0300 Subject: [clang][NFC] Refactor bit-fields in `ObjCAtTryStmt` Make all bit-fields of type `unsigned`, which reduces the amount of padding on Microsoft ABI, potentially resulting in a smaller object. --- clang/include/clang/AST/StmtObjC.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/AST/StmtObjC.h b/clang/include/clang/AST/StmtObjC.h index c46ff46..03bc61f 100644 --- a/clang/include/clang/AST/StmtObjC.h +++ b/clang/include/clang/AST/StmtObjC.h @@ -177,7 +177,8 @@ class ObjCAtTryStmt final unsigned NumCatchStmts : 16; // Whether this statement has a \@finally statement. - bool HasFinally : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned HasFinally : 1; /// Retrieve the statements that are stored after this \@try statement. /// -- cgit v1.1 From 15279e7569108cccb49ca1fcfdfae420124d3fac Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Sun, 11 Feb 2024 15:04:03 +0100 Subject: [OpenMP] Remove -Wno-enum-constexpr-conversion (#81318) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This effectively reverts commit 9ff0cc7e0fa7e99163610d2fcb58e96f3315e343. For some reason "git revert" led to "no changes" after fixing conflicts, so a clean revert was not possible. The original issue (#57022) is no longer reproducible even with this patch, so we can remove the suppression. This is in line with our goal to make -Wenum-constexpr-conversion a non-downgradeable error, see #59036. Co-authored-by: Carlos Gálvez --- openmp/cmake/HandleOpenMPOptions.cmake | 1 - openmp/cmake/config-ix.cmake | 1 - 2 files changed, 2 deletions(-) diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index 9387d9b..4809520 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -41,7 +41,6 @@ append_if(OPENMP_HAVE_WSIGN_COMPARE_FLAG "-Wsign-compare" CMAKE_C_FLAGS CMAKE_CX # printed. Therefore, check for whether the compiler supports options in the # form -W, and if supported, add the corresponding -Wno- option. -append_if(OPENMP_HAVE_WENUM_CONSTEXPR_CONVERSION_FLAG "-Wno-enum-constexpr-conversion" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WEXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WPEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) diff --git a/openmp/cmake/config-ix.cmake b/openmp/cmake/config-ix.cmake index a1e1b61..cfc6833 100644 --- a/openmp/cmake/config-ix.cmake +++ b/openmp/cmake/config-ix.cmake @@ -33,7 +33,6 @@ check_cxx_compiler_flag(-Wsign-compare OPENMP_HAVE_WSIGN_COMPARE_FLAG) # printed. Therefore, check for whether the compiler supports options in the # form -W, and if supported, add the corresponding -Wno- option.
-check_cxx_compiler_flag(-Wenum-constexpr-conversion OPENMP_HAVE_WENUM_CONSTEXPR_CONVERSION_FLAG) check_cxx_compiler_flag(-Wextra OPENMP_HAVE_WEXTRA_FLAG) check_cxx_compiler_flag(-Wpedantic OPENMP_HAVE_WPEDANTIC_FLAG) check_cxx_compiler_flag(-Wmaybe-uninitialized OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG) -- cgit v1.1 From e3f684d86b308bc2576d813aad1a230aa6b295ab Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 11 Feb 2024 17:27:31 +0300 Subject: [clang][NFC] Refactor bit-fields in `DefaultedFunctionKind` This patch makes all bit-fields in `DefaultedFunctionKind` of type `unsigned`, which brings `sizeof(DefaultedFunctionKind)` down from 8 to 4 when compiling Clang for Microsoft ABI. --- clang/include/clang/Sema/Sema.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 3c26003..851560f 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -3501,29 +3501,29 @@ public: /// For a defaulted function, the kind of defaulted function that it is. class DefaultedFunctionKind { - CXXSpecialMember SpecialMember : 8; - DefaultedComparisonKind Comparison : 8; + unsigned SpecialMember : 8; + unsigned Comparison : 8; public: DefaultedFunctionKind() - : SpecialMember(CXXInvalid), Comparison(DefaultedComparisonKind::None) { + : SpecialMember(CXXInvalid), Comparison(llvm::to_underlying(DefaultedComparisonKind::None)) { } DefaultedFunctionKind(CXXSpecialMember CSM) - : SpecialMember(CSM), Comparison(DefaultedComparisonKind::None) {} + : SpecialMember(CSM), Comparison(llvm::to_underlying(DefaultedComparisonKind::None)) {} DefaultedFunctionKind(DefaultedComparisonKind Comp) - : SpecialMember(CXXInvalid), Comparison(Comp) {} + : SpecialMember(CXXInvalid), Comparison(llvm::to_underlying(Comp)) {} bool isSpecialMember() const { return SpecialMember != CXXInvalid; } bool isComparison() const { - return Comparison != DefaultedComparisonKind::None; + return static_cast(Comparison) != DefaultedComparisonKind::None; } explicit operator bool() const { return isSpecialMember() || isComparison(); } - CXXSpecialMember asSpecialMember() const { return SpecialMember; } - DefaultedComparisonKind asComparison() const { return Comparison; } + CXXSpecialMember asSpecialMember() const { return static_cast(SpecialMember); } + DefaultedComparisonKind asComparison() const { return static_cast(Comparison); } /// Get the index of this function kind for use in diagnostics. unsigned getDiagnosticIndex() const { @@ -3531,7 +3531,7 @@ public: "invalid should have highest index"); static_assert((unsigned)DefaultedComparisonKind::None == 0, "none should be equal to zero"); - return SpecialMember + (unsigned)Comparison; + return SpecialMember + Comparison; } }; -- cgit v1.1 From b45de48be24695b613f48ed21bb35f844454193b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 11 Feb 2024 15:02:27 +0000 Subject: [MVE] Expand64BitShift - handle all constant shift amounts less than 32 (#81261) Expand64BitShift was always dropping to generic shift legalization if the shift amount type was larger than i64, even if the constant shift amount was actually very small. I've adjusted the constant bounds checks to work with APInt types so we can always perform the comparison. This results in the MVE long shift instructions being used more often, and it looks like this is preventing some additional combines from happening. This could be addressed in the future. 
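A standalone sketch of the adjusted bounds check just described (using llvm::APInt directly; the helper name is invented for illustration, and the SelectionDAG plumbing around it is omitted):

    #include "llvm/ADT/APInt.h"

    // The MVE long-shift expansion applies only to constant shift
    // amounts in the range [1, 31]. Comparing the APInt itself keeps
    // the check well defined at any bit width, whereas calling
    // getZExtValue() on the constant would assert once the value has
    // more than 64 significant bits.
    static bool useMVELongShift(const llvm::APInt &ShAmt) {
      return ShAmt != 0 && ShAmt.ult(32);
    }

With this form an i128 shift amount of, say, 7 is still recognized as small, so the lowering keeps the MVE long shifts instead of bailing out to the generic expansion.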
This came about while I was trying to extend the DAGTypeLegalizer::ExpandShift* helpers and need to move to consistently using the legal shift amount types instead of reusing the shift amount type from the original wider shift. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 4 +- llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll | 1703 +++++++++++---------- llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll | 1327 ++++++++-------- 3 files changed, 1545 insertions(+), 1489 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index b5c4a8a..b98006e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -6702,8 +6702,8 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, // If the shift amount is greater than 32 or has a greater bitwidth than 64 // then do the default optimisation - if (ShAmt->getValueType(0).getSizeInBits() > 64 || - (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) + if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) || + (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32)))) return SDValue(); // Extract the lower 32 bits of the shift amount if it's not an i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll index 3ca01cf..570834f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll @@ -1821,44 +1821,42 @@ define arm_aapcs_vfpcc <4 x i32> @test_signed_v4f32_v4i32_duplicate(<4 x float> define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i50: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: vldr s20, .LCPI28_0 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: movtlt r7, #65534 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movtlt r5, #65534 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vldr s22, .LCPI28_1 ; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: movtlt r5, #65534 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: movtlt r7, #65534 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r7, #65535 -; CHECK-NEXT: movtgt r7, #1 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #1 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it lt @@ -1866,109 
+1864,103 @@ define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r5, #65535 -; CHECK-NEXT: movtgt r5, #1 +; CHECK-NEXT: movwgt r7, #65535 +; CHECK-NEXT: movtgt r7, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: str.w r0, [r8] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s22 -; CHECK-NEXT: str.w r0, [r8] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: itt vs -; CHECK-NEXT: movvs r6, #0 -; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: lsls r0, r5, #22 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: orr.w r0, r0, r6, lsr #10 -; CHECK-NEXT: str.w r0, [r8, #20] -; CHECK-NEXT: it vs +; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: movvs r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r9, #0 -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r9, #-1 -; CHECK-NEXT: vcmp.f32 s17, s17 +; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs ; CHECK-NEXT: movvs.w r9, #0 -; CHECK-NEXT: lsr.w r0, r9, #14 -; CHECK-NEXT: orr.w r1, r0, r7, lsl #18 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: str.w r1, [r8, #8] +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: bfc r5, #18, #14 +; CHECK-NEXT: mov r6, r9 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsll r4, r1, #22 +; CHECK-NEXT: lsrl r6, r5, #28 +; CHECK-NEXT: itt lt +; CHECK-NEXT: movwlt r10, #0 +; CHECK-NEXT: movtlt r10, #65534 +; CHECK-NEXT: vcmp.f32 s16, s22 +; CHECK-NEXT: orrs r1, r5 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r10, #65535 +; CHECK-NEXT: movtgt r10, #1 +; CHECK-NEXT: str.w r1, [r8, #20] ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: lsrs r2, r5, #10 +; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: orr.w r2, r6, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: itt lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: movtlt r1, #65534 +; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r1, #65535 ; CHECK-NEXT: movtgt r1, #1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r4, #0 -; CHECK-NEXT: movtlt r4, #65534 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r4, #65535 -; CHECK-NEXT: movtgt r4, #1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 +; CHECK-NEXT: str.w r2, [r8, #16] +; CHECK-NEXT: lsrs r2, r7, #10 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: strb.w r2, [r8, #24] +; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: ubfx r2, r7, #14, #4 -; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r2, [r8, #12] -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r4, #0 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: bfc r4, #18, #14 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: orr.w r2, r4, r9, lsl #18 -; CHECK-NEXT: str.w r2, [r8, #4] -; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: lsrs r0, r0, #28 ; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #22 -; CHECK-NEXT: str.w r0, [r8, #16] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: orr.w r1, r1, r9, lsl #4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: strd r2, r1, [r8, #8] +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs.w r10, #0 +; CHECK-NEXT: bfc r10, #18, #14 +; CHECK-NEXT: orr.w r0, r10, r0, lsl #18 +; CHECK-NEXT: str.w r0, [r8, #4] ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI28_0: @@ -2120,21 +2112,22 @@ define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) { define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vldr s20, .LCPI30_0 -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: vmov r5, s16 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vldr s22, .LCPI30_1 -; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vldr s22, .LCPI30_0 +; CHECK-NEXT: vmov r7, s17 +; CHECK-NEXT: vldr s20, .LCPI30_1 +; CHECK-NEXT: vmov r4, s19 ; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 @@ -2150,7 +2143,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: str.w r2, [r9, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -2162,7 +2155,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r9, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 @@ -2173,11 +2166,11 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #25] -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r11, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt @@ -2192,7 +2185,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: str.w r2, [r9, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -2204,7 +2197,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] +; CHECK-NEXT: str.w r1, [r9, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 @@ -2215,165 +2208,165 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: str.w r0, [r9] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: vcmp.f32 s19, s22 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s22 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: mvnlt r10, #7 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r10, #7 +; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movvs.w r10, #0 +; CHECK-NEXT: and r0, r10, #15 +; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #37] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: bl __fixsfti +; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s19, s19 -; CHECK-NEXT: lsrs r2, r2, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: vcmp.f32 s19, s22 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: str.w r7, [r4, #41] +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; 
CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: mvnlt r11, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 +; CHECK-NEXT: movgt.w r11, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 +; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movvs.w r11, #0 +; CHECK-NEXT: and r7, r11, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] +; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: orr.w r7, r7, r0, lsl #4 +; CHECK-NEXT: str.w r7, [r9, #12] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r6, #7 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r6, #7 -; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: movgt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r6, #0 -; CHECK-NEXT: and r2, r6, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: movvs r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 +; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: lsrl r6, r5, #28 +; CHECK-NEXT: vcmp.f32 s19, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: orr.w r7, r5, r4, lsl #4 +; CHECK-NEXT: str.w r7, [r9, #45] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: str.w r6, [r9, #41] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: mvnlt r8, #7 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r8, #7 +; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movvs.w r8, #0 +; CHECK-NEXT: and r5, r8, #15 +; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: lsrl r4, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] +; CHECK-NEXT: strb.w r4, [r9, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: mvnlt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: movvs r3, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: lsr.w r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 -; CHECK-NEXT: str r1, [r4, #16] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt 
r3, #7 +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s17, s22 +; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r5, #7 -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsrl r0, r1, #28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #7 -; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: vcmp.f32 s17, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: and r1, r5, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r9, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r9, #24] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI30_0: -; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 -; CHECK-NEXT: .LCPI30_1: ; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29 +; CHECK-NEXT: .LCPI30_1: +; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 %x = call <4 x i100> @llvm.fptosi.sat.v4f32.v4i100(<4 x float> %f) ret <4 x i100> %x } @@ -3694,151 +3687,155 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI40_0 -; CHECK-NEXT: vmov r6, r5, d8 -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: vmov r9, r8, d0 -; CHECK-NEXT: str.w r8, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: vmov r5, r7, d8 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: vmov r9, r3, d0 +; CHECK-NEXT: str r0, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI40_1 -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: vmov r7, r3, d0 -; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: vmov r8, r3, d0 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r11, r3 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: csel r4, r2, r4, ne +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it 
ne ; CHECK-NEXT: movne.w r4, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: str.w r11, [sp, #44] @ 4-byte Spill -; CHECK-NEXT: str.w r4, [r11, #8] -; CHECK-NEXT: str.w r9, [sp, #40] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: str.w r4, [r10, #8] +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: str.w r9, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: csel r7, r1, r0, ne -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: csel r6, r1, r0, ne +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 +; CHECK-NEXT: movne.w r6, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: str.w r7, [r11, #4] -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: ldr r0, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: str r6, [r0, #4] +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r10, r8 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: csel r7, r1, r0, ne -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: csel r6, r1, r0, ne +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: movne.w r6, #-1 +; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: vmov r9, r8, d9 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: ldr r0, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: str r7, [r0] -; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: ldr r0, [sp, #40] @ 4-byte Reload 
+; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: str.w r10, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: str.w r11, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __fixdfti +; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: csel r10, r1, r11, ne -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: it eq +; CHECK-NEXT: mvneq r10, #7 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #-1 +; CHECK-NEXT: movne.w r10, #7 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: ldr r7, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r6, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 @@ -3852,21 +3849,21 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: lsr.w r0, r10, #28 -; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r10 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r7, r5 -; CHECK-NEXT: str r0, [r1, #20] +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload @@ -3876,73 +3873,75 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: csel r11, r1, r0, ne ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w 
r11, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: lsr.w r0, r11, #28 -; CHECK-NEXT: orr.w r0, r0, r10, lsl #4 -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: str r0, [r5, #16] +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: mov r4, r11 +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: lsrl r4, r1, #28 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: mov r10, r6 +; CHECK-NEXT: mov r7, r6 ; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: it eq -; CHECK-NEXT: mvneq r0, #7 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #7 -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: csel r6, r1, r0, ne +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r6, #-1 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: lsr.w r0, r4, #28 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 -; CHECK-NEXT: strb r0, [r5, #24] -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: and r1, r10, #15 ; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 +; CHECK-NEXT: lsrl r6, r1, #28 +; CHECK-NEXT: strd r4, r0, [r2, #16] +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: strb r6, [r2, #24] +; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: it eq ; CHECK-NEXT: mvneq r0, #7 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #7 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 @@ -3950,7 +3949,7 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: and r0, r4, #15 ; CHECK-NEXT: orr.w r0, r0, r11, lsl #4 -; CHECK-NEXT: str r0, [r5, 
#12] +; CHECK-NEXT: str.w r0, [r8, #12] ; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -4694,107 +4693,127 @@ define arm_aapcs_vfpcc <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) { define arm_aapcs_vfpcc <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) { ; CHECK-LABEL: test_signed_v8f16_v8i19: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldr s12, .LCPI46_0 -; CHECK-NEXT: vcvtt.f32.f16 s15, s3 -; CHECK-NEXT: vldr s14, .LCPI46_1 -; CHECK-NEXT: vcvtb.f32.f16 s7, s0 -; CHECK-NEXT: vmaxnm.f32 s16, s15, s12 -; CHECK-NEXT: vcvtb.f32.f16 s4, s1 -; CHECK-NEXT: vcvtt.f32.f16 s8, s1 -; CHECK-NEXT: vcvtb.f32.f16 s1, s2 -; CHECK-NEXT: vcvtt.f32.f16 s0, s0 -; CHECK-NEXT: vcvtt.f32.f16 s2, s2 -; CHECK-NEXT: vcvtb.f32.f16 s3, s3 -; CHECK-NEXT: vmaxnm.f32 s6, s4, s12 -; CHECK-NEXT: vmaxnm.f32 s10, s8, s12 -; CHECK-NEXT: vmaxnm.f32 s5, s1, s12 -; CHECK-NEXT: vmaxnm.f32 s9, s7, s12 -; CHECK-NEXT: vmaxnm.f32 s11, s0, s12 -; CHECK-NEXT: vmaxnm.f32 s13, s2, s12 -; CHECK-NEXT: vminnm.f32 s16, s16, s14 -; CHECK-NEXT: vmaxnm.f32 s12, s3, s12 -; CHECK-NEXT: vcvt.s32.f32 s16, s16 -; CHECK-NEXT: vminnm.f32 s12, s12, s14 -; CHECK-NEXT: vminnm.f32 s13, s13, s14 -; CHECK-NEXT: vcvt.s32.f32 s12, s12 -; CHECK-NEXT: vminnm.f32 s9, s9, s14 -; CHECK-NEXT: vcvt.s32.f32 s13, s13 -; CHECK-NEXT: vminnm.f32 s11, s11, s14 -; CHECK-NEXT: vcvt.s32.f32 s11, s11 -; CHECK-NEXT: vminnm.f32 s5, s5, s14 -; CHECK-NEXT: vcvt.s32.f32 s9, s9 -; CHECK-NEXT: vminnm.f32 s10, s10, s14 -; CHECK-NEXT: vcmp.f32 s15, s15 -; CHECK-NEXT: vminnm.f32 s6, s6, s14 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: lsrs r2, r1, #11 -; CHECK-NEXT: vcmp.f32 s3, s3 -; CHECK-NEXT: strb r2, [r0, #18] -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: .save {r4, r5, r7, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r7, r9, r11, lr} +; CHECK-NEXT: vldr s4, .LCPI46_0 +; CHECK-NEXT: vcvtb.f32.f16 s8, s1 +; CHECK-NEXT: vcvtt.f32.f16 s12, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vldr s6, .LCPI46_1 +; CHECK-NEXT: vmaxnm.f32 s5, s1, s4 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vmaxnm.f32 s14, s12, s4 +; CHECK-NEXT: vminnm.f32 s5, s5, s6 +; CHECK-NEXT: vmaxnm.f32 s7, s0, s4 +; CHECK-NEXT: vminnm.f32 s7, s7, s6 +; CHECK-NEXT: vcvt.s32.f32 s5, s5 +; CHECK-NEXT: vcvt.s32.f32 s7, s7 +; CHECK-NEXT: vminnm.f32 s14, s14, s6 +; CHECK-NEXT: vcvt.s32.f32 s14, s14 +; CHECK-NEXT: vmaxnm.f32 s10, s8, s4 +; CHECK-NEXT: vminnm.f32 s10, s10, s6 +; CHECK-NEXT: vcmp.f32 s1, s1 +; CHECK-NEXT: vcvt.s32.f32 s10, s10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: ubfx r2, r3, #14, #5 -; CHECK-NEXT: vcvt.s32.f32 s5, s5 -; CHECK-NEXT: orr.w r1, r2, r1, lsl #5 -; CHECK-NEXT: vcmp.f32 s2, s2 -; CHECK-NEXT: strh r1, [r0, #16] -; CHECK-NEXT: vmov lr, s13 +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: bfc r2, #19, #13 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w lr, #0 -; CHECK-NEXT: ubfx r1, lr, #1, #18 -; CHECK-NEXT: vcmp.f32 s0, s0 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #18 -; CHECK-NEXT: vcvt.s32.f32 s10, s10 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: vcmp.f32 s12, s12 ; CHECK-NEXT: vmrs 
APSR_nzcv, fpscr -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: str r1, [r0, #12] -; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vcmp.f32 s8, s8 +; CHECK-NEXT: lsll r2, r7, #19 +; CHECK-NEXT: bfc r1, #19, #13 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmaxnm.f32 s8, s0, s4 +; CHECK-NEXT: orr.w r1, r1, r2 +; CHECK-NEXT: str r1, [r0] ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs.w r12, #0 -; CHECK-NEXT: vcmp.f32 s7, s7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s2 +; CHECK-NEXT: vmaxnm.f32 s2, s0, s4 +; CHECK-NEXT: vminnm.f32 s8, s8, s6 +; CHECK-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r3, #0 +; CHECK-NEXT: vcvt.s32.f32 s8, s8 ; CHECK-NEXT: bfc r3, #19, #13 -; CHECK-NEXT: vcvt.s32.f32 s6, s6 -; CHECK-NEXT: orr.w r3, r3, r12, lsl #19 -; CHECK-NEXT: str r3, [r0] -; CHECK-NEXT: vcmp.f32 s1, s1 -; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bfc r2, #19, #13 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: lsrl r2, r1, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: lsrl r4, r9, #26 +; CHECK-NEXT: vcvtt.f32.f16 s0, s3 +; CHECK-NEXT: mov lr, r1 +; CHECK-NEXT: orr.w r1, r4, r2 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmaxnm.f32 s2, s0, s4 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vminnm.f32 s2, s2, s6 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: vcmp.f32 s8, s8 -; CHECK-NEXT: bfc r3, #19, #13 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: vcvt.s32.f32 s2, s2 +; CHECK-NEXT: bfc r2, #19, #13 +; CHECK-NEXT: lsll r2, r5, #12 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: ubfx r2, r1, #7, #12 -; CHECK-NEXT: vcmp.f32 s4, s4 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #12 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: orr.w r2, r2, lr, lsl #31 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: orrs r2, r1 +; CHECK-NEXT: bfc r4, #19, #13 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: lsll r4, r1, #31 +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: orrs r2, r4 ; CHECK-NEXT: str r2, [r0, #8] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ubfx r3, r12, #13, #6 +; CHECK-NEXT: orr.w r2, r7, r3, lsl #6 +; CHECK-NEXT: vcvtb.f32.f16 s0, s3 +; CHECK-NEXT: orr.w r3, r2, r12, lsl #25 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmaxnm.f32 s2, s0, s4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vminnm.f32 s2, s2, s6 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: vcvt.s32.f32 s2, s2 ; CHECK-NEXT: bfc r2, #19, #13 -; CHECK-NEXT: orr.w r2, r3, r2, lsl #6 -; CHECK-NEXT: orr.w r1, r2, r1, lsl #25 -; CHECK-NEXT: str r1, [r0, #4] -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vcmp.f32 s0, s0 +; CHECK-NEXT: lsll r2, r7, #5 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: bfc r4, #19, #13 +; CHECK-NEXT: lsrl r4, r11, #14 +; CHECK-NEXT: orrs r2, r4 +; CHECK-NEXT: strh r2, [r0, #16] +; CHECK-NEXT: str r3, [r0, #4] +; CHECK-NEXT: lsrs r2, r2, #16 +; CHECK-NEXT: strb r2, [r0, #18] +; CHECK-NEXT: orr.w r2, r9, lr +; CHECK-NEXT: orrs r2, r5 +; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: orr.w r1, r1, r7, lsl #18 +; CHECK-NEXT: str r1, [r0, #12] +; CHECK-NEXT: pop.w {r4, r5, r7, r9, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: 
.LCPI46_0: @@ -4844,42 +4863,40 @@ define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: vcvtt.f32.f16 s28, s19 -; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vcvtt.f32.f16 s30, s19 +; CHECK-NEXT: vmov r0, s30 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: vcvtb.f32.f16 s26, s18 -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vldr s22, .LCPI48_1 -; CHECK-NEXT: vcvtb.f32.f16 s24, s16 -; CHECK-NEXT: vcvtt.f32.f16 s18, s18 -; CHECK-NEXT: vcmp.f32 s28, s22 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: vldr s24, .LCPI48_1 +; CHECK-NEXT: vcvtb.f32.f16 s20, s16 +; CHECK-NEXT: vcvtb.f32.f16 s28, s19 +; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vmov r6, s24 -; CHECK-NEXT: vldr s20, .LCPI48_0 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r7, s20 +; CHECK-NEXT: vldr s22, .LCPI48_0 +; CHECK-NEXT: vmov r6, s28 ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r4, #0 -; CHECK-NEXT: movtlt r4, #65534 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movtlt r5, #65534 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s26, s22 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: mov r10, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcmp.f32 s30, s22 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s26, s22 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r4, #65535 -; CHECK-NEXT: movtgt r4, #1 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 @@ -4887,263 +4904,244 @@ define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r11, #25] -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s24, s22 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: vcmp.f32 s20, s24 +; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: vcmp.f32 s20, s22 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s24 +; CHECK-NEXT: vcmp.f32 s20, s20 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: vcmp.f32 s30, s24 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: str.w r0, [r9] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r0, [r11] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s30, s22 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: movgt.w r4, #-1 +; 
CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r7, #0 -; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs ; CHECK-NEXT: movvs r4, #0 -; CHECK-NEXT: lsls r0, r4, #22 -; CHECK-NEXT: orr.w r7, r0, r7, lsr #10 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: bfc r7, #18, #14 +; CHECK-NEXT: lsll r4, r7, #22 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: itt lt +; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r6, #0 -; CHECK-NEXT: movtlt r6, #65534 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r6, #65535 -; CHECK-NEXT: movtgt r6, #1 +; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: str.w r7, [r11, #45] +; CHECK-NEXT: vcmp.f32 s28, s24 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: itt lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movtlt r1, #65534 +; CHECK-NEXT: vcmp.f32 s28, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: lsrs r0, r5, #14 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #18 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 -; CHECK-NEXT: str.w r0, [r11, #33] -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: vcvtt.f32.f16 s28, s18 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: orr.w r0, r1, r7 +; CHECK-NEXT: str.w r0, [r9, #45] +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: orrs r4, r2 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movtlt r1, #65534 +; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: vcvtb.f32.f16 s18, s17 +; CHECK-NEXT: lsrs r0, r5, #10 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #1 +; CHECK-NEXT: str.w r4, [r9, #41] +; CHECK-NEXT: strb.w r0, [r9, #49] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vcmp.f32 s28, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s28, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt.w r7, #-1 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: lsrl r4, r1, #14 +; CHECK-NEXT: orr.w 
r6, r1, r6, lsl #4 +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: vcvtt.f32.f16 s28, s17 +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vcmp.f32 s18, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: itt lt -; CHECK-NEXT: movwlt r9, #0 -; CHECK-NEXT: movtlt r9, #65534 -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movtlt r5, #65534 +; CHECK-NEXT: vcmp.f32 s18, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r9, #65535 -; CHECK-NEXT: movtgt r9, #1 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: vcvtt.f32.f16 s16, s16 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r9, #0 -; CHECK-NEXT: lsl.w r0, r9, #22 -; CHECK-NEXT: orr.w r0, r0, r1, lsr #10 -; CHECK-NEXT: str.w r0, [r11, #20] -; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #1 +; CHECK-NEXT: str.w r6, [r9, #37] +; CHECK-NEXT: str.w r4, [r9, #33] ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #0 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #-1 +; CHECK-NEXT: vcmp.f32 s28, s22 +; CHECK-NEXT: itt lt +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: movtlt r4, #65534 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: vcmp.f32 s16, s22 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r8, #0 +; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r4, #65535 +; CHECK-NEXT: movtgt r4, #1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt lt ; CHECK-NEXT: movwlt r10, #0 ; CHECK-NEXT: movtlt r10, #65534 -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: lsr.w r0, r8, #14 +; CHECK-NEXT: vcmp.f32 s26, s22 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r10, #65535 ; CHECK-NEXT: movtgt r10, #1 +; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs.w r10, #0 -; CHECK-NEXT: orr.w r0, r0, r10, lsl #18 -; CHECK-NEXT: str.w r0, [r11, #8] -; CHECK-NEXT: lsrs r0, r4, #10 -; CHECK-NEXT: vcvtb.f32.f16 s16, s19 -; CHECK-NEXT: strb.w r0, [r11, #49] +; CHECK-NEXT: bfc r10, #18, #14 +; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: orr.w r0, r10, r7, lsl #18 +; CHECK-NEXT: str.w r0, [r9, #29] ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s28, s22 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: ubfx r0, r6, #14, #4 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: vcvtb.f32.f16 s18, s17 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r7, #0 -; CHECK-NEXT: orr.w r0, r0, r7, lsl #4 -; CHECK-NEXT: str.w r0, [r11, #37] -; CHECK-NEXT: vcmp.f32 s26, s22 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt lt -; CHECK-NEXT: 
movlt r0, #0 -; CHECK-NEXT: movtlt r0, #65534 -; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r0, #65535 -; CHECK-NEXT: movtgt r0, #1 -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: bfc r0, #18, #14 -; CHECK-NEXT: orr.w r0, r0, r5, lsl #18 -; CHECK-NEXT: str.w r0, [r11, #29] -; CHECK-NEXT: lsr.w r0, r9, #10 -; CHECK-NEXT: strb.w r0, [r11, #24] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: ubfx r2, r10, #14, #4 +; CHECK-NEXT: vcmp.f32 s18, s24 +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: movvs r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s18, s22 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt.w r11, #-1 ; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: str.w r2, [r11, #12] -; CHECK-NEXT: vcmp.f32 s24, s22 -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs.w r11, #0 +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: vcmp.f32 s20, s24 +; CHECK-NEXT: bfc r5, #18, #14 +; CHECK-NEXT: mov r10, r11 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: lsll r6, r1, #22 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r2, #0 -; CHECK-NEXT: movtlt r2, #65534 +; CHECK-NEXT: movwlt r8, #0 +; CHECK-NEXT: movtlt r8, #65534 +; CHECK-NEXT: vcmp.f32 s20, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r2, #65535 -; CHECK-NEXT: movtgt r2, #1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s22 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: movwgt r8, #65535 +; CHECK-NEXT: movtgt r8, #1 +; CHECK-NEXT: orrs r1, r5 +; CHECK-NEXT: str.w r1, [r9, #20] +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: vcmp.f32 s16, s24 +; CHECK-NEXT: orr.w r2, r10, r6 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: itt lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: movtlt r1, #65534 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s22 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r1, #65535 ; CHECK-NEXT: movtgt r1, #1 +; CHECK-NEXT: str.w r2, [r9, #16] +; CHECK-NEXT: lsrs r2, r4, #10 +; CHECK-NEXT: vcmp.f32 s16, s24 +; CHECK-NEXT: strb.w r2, [r9, #24] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r4, #0 -; CHECK-NEXT: movtlt r4, #65534 -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: bfc r2, #18, #14 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s16, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r4, #65535 -; CHECK-NEXT: movtgt r4, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: vcmp.f32 s16, s16 -; CHECK-NEXT: orr.w r2, r2, r8, lsl #18 -; CHECK-NEXT: str.w r2, [r11, #4] -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r4, #0 -; CHECK-NEXT: bfc r4, #18, #14 -; CHECK-NEXT: 
ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: lsrs r2, r7, #28 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: orr.w r2, r2, r4, lsl #4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: orr.w r2, r2, r3, lsl #22 -; CHECK-NEXT: str.w r2, [r11, #41] -; CHECK-NEXT: it vs +; CHECK-NEXT: itt vs +; CHECK-NEXT: movvs r0, #0 ; CHECK-NEXT: movvs r1, #0 -; CHECK-NEXT: lsrs r0, r0, #28 ; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: orr.w r0, r0, r1, lsl #22 -; CHECK-NEXT: str.w r0, [r11, #16] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: vcmp.f32 s20, s20 +; CHECK-NEXT: orr.w r1, r1, r11, lsl #4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: strd r2, r1, [r9, #8] +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs.w r8, #0 +; CHECK-NEXT: bfc r8, #18, #14 +; CHECK-NEXT: orr.w r0, r8, r0, lsl #18 +; CHECK-NEXT: str.w r0, [r9, #4] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 @@ -5426,477 +5424,482 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-LABEL: test_signed_v8f16_v8i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vcvtb.f32.f16 s30, s19 -; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vcvtb.f32.f16 s21, s19 +; CHECK-NEXT: vcvtt.f32.f16 s24, s19 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vcvtb.f32.f16 s26, s16 +; CHECK-NEXT: vcvtb.f32.f16 s28, s17 +; CHECK-NEXT: vcvtb.f32.f16 s30, s18 +; CHECK-NEXT: vldr s20, .LCPI50_2 +; CHECK-NEXT: vmov r8, s24 +; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vcvtt.f32.f16 s22, s18 +; CHECK-NEXT: vmov r6, s28 +; CHECK-NEXT: vmov r5, s30 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcvtb.f32.f16 s28, s18 -; CHECK-NEXT: mov r5, r3 -; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: vldr s24, .LCPI50_2 -; CHECK-NEXT: vldr s20, .LCPI50_3 -; CHECK-NEXT: vcvtt.f32.f16 s19, s19 -; CHECK-NEXT: vcmp.f32 s30, s24 -; CHECK-NEXT: vcvtb.f32.f16 s22, s16 +; CHECK-NEXT: vldr s18, .LCPI50_3 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s21, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s30 +; CHECK-NEXT: vcmp.f32 s21, s21 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 -; CHECK-NEXT: str.w r2, [r4, #83] +; CHECK-NEXT: vcmp.f32 s21, s20 +; CHECK-NEXT: str.w r2, [r9, #83] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s30 +; 
CHECK-NEXT: vcmp.f32 s21, s21 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #79] +; CHECK-NEXT: str.w r1, [r9, #79] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s30, s20 -; CHECK-NEXT: vcvtb.f32.f16 s26, s17 +; CHECK-NEXT: vcmp.f32 s21, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s30, s30 +; CHECK-NEXT: vcmp.f32 s21, s21 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #75] -; CHECK-NEXT: vmov r9, s19 -; CHECK-NEXT: vmov r8, s22 -; CHECK-NEXT: mov r0, r3 -; CHECK-NEXT: vmov r6, s26 +; CHECK-NEXT: str.w r0, [r9, #75] +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s28, s24 -; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: vcmp.f32 s30, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: str.w r2, [r4, #58] +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: str.w r2, [r9, #58] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s24 +; CHECK-NEXT: vcmp.f32 s30, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #54] +; CHECK-NEXT: str.w r1, [r9, #54] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #50] +; CHECK-NEXT: str.w r0, [r9, #50] ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s26, s24 -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: vcmp.f32 s28, s18 +; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: vcmp.f32 s28, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: str.w r2, [r9, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: it gt ; 
CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s24 +; CHECK-NEXT: vcmp.f32 s28, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r9, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str.w r0, [r4, #25] -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s22, s24 -; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: vcmp.f32 s26, s18 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s24 +; CHECK-NEXT: vcmp.f32 s26, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: str.w r2, [r9, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s24 +; CHECK-NEXT: vcmp.f32 s26, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] +; CHECK-NEXT: str.w r1, [r9, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: str.w r0, [r9] +; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: vcmp.f32 s24, s18 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 +; CHECK-NEXT: vcmp.f32 s21, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: lsrs r6, r1, #28 +; CHECK-NEXT: movvs r6, #0 ; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 -; CHECK-NEXT: str.w r6, [r4, #95] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: mvnlt r7, #7 +; CHECK-NEXT: vcmp.f32 s21, s20 +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt r7, #7 +; CHECK-NEXT: vcmp.f32 s21, s21 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: str.w r1, [r4, #91] +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: and r0, r7, #15 +; CHECK-NEXT: orr.w r1, r0, r6, lsl #4 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: str.w r1, [r9, #87] +; CHECK-NEXT: bl __fixsfti +; CHECK-NEXT: vcmp.f32 s22, s18 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: movlt.w r8, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 -; CHECK-NEXT: lsrs r1, r2, #28 -; CHECK-NEXT: vcvtt.f32.f16 s19, s18 +; CHECK-NEXT: movgt.w r8, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r1, r3, lsl #4 -; CHECK-NEXT: vmov r1, s19 -; CHECK-NEXT: strb.w r2, [r4, #99] -; CHECK-NEXT: vcmp.f32 s30, s24 +; CHECK-NEXT: movvs.w r8, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: mvnlt r5, #7 ; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r5, #7 -; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r5, #0 -; CHECK-NEXT: and r2, r5, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #87] -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: and r0, r5, #15 +; CHECK-NEXT: orr.w r0, r0, r8, lsl #4 +; CHECK-NEXT: vcvtt.f32.f16 s30, s17 +; CHECK-NEXT: str.w r0, [r9, #62] +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: mov r7, r3 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: vcmp.f32 s28, s18 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; 
CHECK-NEXT: mvnlt r0, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt r0, #7 +; CHECK-NEXT: vcmp.f32 s28, s28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r6, r1, #28 -; CHECK-NEXT: vcmp.f32 s19, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 +; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: and r0, r0, #15 +; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #37] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: bl __fixsfti +; CHECK-NEXT: vcmp.f32 s16, s18 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #70] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s19, s19 -; CHECK-NEXT: lsrs r2, r2, #28 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: str.w r1, [r4, #66] -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vcmp.f32 s19, s24 -; CHECK-NEXT: vcvtt.f32.f16 s16, s16 +; CHECK-NEXT: vcmp.f32 s26, s18 +; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: mvnlt r5, #7 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s19 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 +; CHECK-NEXT: movgt r5, #7 +; CHECK-NEXT: vcmp.f32 s26, s26 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s28, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movvs r5, #0 +; CHECK-NEXT: vcmp.f32 s24, s18 +; CHECK-NEXT: and r5, r5, #15 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: orr.w r5, r5, r0, lsl #4 +; CHECK-NEXT: str.w r5, [r9, #12] +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r11, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: b.w .LBB50_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI50_2: -; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29 +; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: .LCPI50_3: -; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29 +; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29 ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: .LBB50_3: -; CHECK-NEXT: strb.w r2, [r4, #74] -; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r7, #7 -; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r7, #7 -; CHECK-NEXT: vcmp.f32 s28, s28 +; CHECK-NEXT: movgt.w r11, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s24, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r7, #0 -; CHECK-NEXT: and r2, r7, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #62] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s18, s24 +; CHECK-NEXT: movvs.w r11, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 +; CHECK-NEXT: lsrl r6, 
r11, #28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs.w r10, #0 +; CHECK-NEXT: orr.w r5, r11, r10, lsl #4 +; CHECK-NEXT: str.w r5, [r9, #95] +; CHECK-NEXT: str.w r6, [r9, #91] +; CHECK-NEXT: vcmp.f32 s24, s18 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: mvnlt r6, #7 +; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt r6, #7 +; CHECK-NEXT: vcmp.f32 s24, s24 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s18, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: and r5, r6, #15 +; CHECK-NEXT: vcmp.f32 s22, s18 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: strb.w r10, [r9, #99] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: mvnlt r7, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcmp.f32 s18, s18 -; CHECK-NEXT: lsrs r2, r2, #28 +; CHECK-NEXT: movgt r7, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s22, s18 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: vcmp.f32 s18, s24 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: movvs r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r7, [r4, #41] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r3, #7 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s18 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #7 +; CHECK-NEXT: movgt.w r4, #-1 +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r7 +; CHECK-NEXT: vcmp.f32 s22, s18 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r10, #7 -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: lsrl r8, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r10, #7 -; CHECK-NEXT: vcmp.f32 s26, s26 +; CHECK-NEXT: movgt.w r4, #-1 +; CHECK-NEXT: vcmp.f32 s22, s22 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r10, #0 -; CHECK-NEXT: and r2, r10, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: vcmp.f32 s16, s24 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: orr.w r6, r5, r4, lsl #4 +; CHECK-NEXT: and r5, r7, #15 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: str.w r6, [r9, 
#70] +; CHECK-NEXT: str.w r8, [r9, #66] +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: strb.w r4, [r9, #74] +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: mvnlt r4, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt r4, #7 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r1, #0 +; CHECK-NEXT: movvs r4, #0 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: movgt.w r7, #-1 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r2, #0 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movvs r7, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r4 +; CHECK-NEXT: vcmp.f32 s30, s18 +; CHECK-NEXT: ldr.w r12, [sp] @ 4-byte Reload +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: lsrl r12, r5, #28 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: vcmp.f32 s30, s30 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s24 ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs r0, #0 +; CHECK-NEXT: movvs r6, #0 +; CHECK-NEXT: orr.w r7, r5, r6, lsl #4 +; CHECK-NEXT: and r5, r4, #15 +; CHECK-NEXT: vcmp.f32 s16, s18 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: lsr.w r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 -; CHECK-NEXT: str r1, [r4, #16] +; CHECK-NEXT: str.w r7, [r9, #45] +; CHECK-NEXT: str.w r12, [r9, #41] +; CHECK-NEXT: strb.w r6, [r9, #49] ; CHECK-NEXT: it lt ; CHECK-NEXT: mvnlt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -5904,28 +5907,44 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt r3, #7 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s22, s24 +; CHECK-NEXT: vcmp.f32 s16, s18 ; CHECK-NEXT: it vs ; CHECK-NEXT: movvs r3, #0 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] ; CHECK-NEXT: it lt -; CHECK-NEXT: mvnlt r8, #7 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #7 -; CHECK-NEXT: vcmp.f32 s22, s22 +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vcmp.f32 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it vs -; CHECK-NEXT: movvs.w r8, #0 -; CHECK-NEXT: and r1, r8, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movvs r1, #0 +; 
CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s16, s18 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsrl r0, r1, #28 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: vcmp.f32 s16, s16 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it vs +; CHECK-NEXT: movvs r2, #0 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r9, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r9, #24] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: @ %bb.4: %x = call <8 x i100> @llvm.fptosi.sat.v8f16.v8i100(<8 x half> %f) ret <8 x i100> %x diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll index 8ea12bd..2b6d0da 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll @@ -1506,110 +1506,110 @@ define arm_aapcs_vfpcc <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vldr s20, .LCPI28_0 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov r6, s19 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vldr s20, .LCPI28_0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s17, #0 -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r10, #65535 -; CHECK-NEXT: movtgt r10, #3 +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #3 ; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: vcmp.f32 s19, #0 -; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r7, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r6, #65535 -; CHECK-NEXT: movtgt r6, #3 +; CHECK-NEXT: movwgt r7, #65535 +; CHECK-NEXT: movtgt r7, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: str.w r7, [r8] +; CHECK-NEXT: str.w r6, [r8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsl.w r0, r6, #22 -; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: mov r1, r7 ; 
CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r4, #-1 -; CHECK-NEXT: orr.w r0, r0, r4, lsr #10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r0, [r8, #20] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r9, #0 -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r9, #-1 -; CHECK-NEXT: lsr.w r0, r9, #14 -; CHECK-NEXT: orr.w r1, r0, r10, lsl #18 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: str.w r1, [r8, #8] -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: lsrs r2, r6, #10 +; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: bfc r5, #18, #14 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: lsll r4, r1, #22 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt.w r9, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: orrs r1, r5 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r1, #65535 -; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: movwgt r9, #65535 +; CHECK-NEXT: movtgt r9, #3 +; CHECK-NEXT: str.w r1, [r8, #20] +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: orr.w r2, r6, r4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: bfc r9, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r5, #65535 -; CHECK-NEXT: movtgt r5, #3 +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: str.w r2, [r8, #16] +; CHECK-NEXT: lsrs r2, r7, #10 +; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: strb.w r2, [r8, #24] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: ubfx r2, r10, #14, #4 -; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: lsrs r0, r0, #28 -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: bfc r5, #18, #14 -; CHECK-NEXT: str.w r2, [r8, #12] -; CHECK-NEXT: orr.w r2, r5, r9, lsl #18 -; CHECK-NEXT: str.w r2, [r8, #4] -; CHECK-NEXT: orr.w r0, r0, r4, lsl #22 -; CHECK-NEXT: str.w r0, [r8, #16] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: orr.w r0, r9, r0, lsl #18 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: orr.w r1, r1, r10, lsl #4 +; CHECK-NEXT: strd r2, r1, [r8, #8] +; CHECK-NEXT: str.w r0, [r8, #4] ; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; CHECK-NEXT: .p2align 2 @@ -1729,32 +1729,31 @@ define arm_aapcs_vfpcc <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) { define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-LABEL: test_unsigned_v4f32_v4i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10} ; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, 
r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vldr s20, .LCPI30_0 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r7, s19 ; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: mov r9, r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vldr s20, .LCPI30_0 ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r6, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: str.w r2, [r8, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -1762,18 +1761,20 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r8, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str.w r0, [r4, #25] -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: str.w r0, [r8, #25] +; CHECK-NEXT: vmov r7, s17 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r10, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt @@ -1784,7 +1785,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: str.w r2, [r8, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -1792,126 +1793,126 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] +; CHECK-NEXT: str.w r1, [r8, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: str.w r0, [r8] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s19, #0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt.w r9, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movgt.w r9, #15 +; CHECK-NEXT: and r0, r9, #15 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 +; CHECK-NEXT: str.w r0, [r8, #37] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r11, r3 +; CHECK-NEXT: bl __fixunssfti 
+; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: vcmp.f32 s19, #0 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s17 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s19, s20 -; CHECK-NEXT: str.w r7, [r4, #41] +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt.w r10, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r2, r2, #28 -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movgt.w r10, #15 +; CHECK-NEXT: and r7, r10, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] +; CHECK-NEXT: vcmp.f32 s19, s20 +; CHECK-NEXT: orr.w r7, r7, r0, lsl #4 +; CHECK-NEXT: str.w r7, [r8, #12] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r6, #15 -; CHECK-NEXT: and r2, r6, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: movgt.w r5, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: vcmp.f32 s19, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: orr.w r7, r5, r6, lsl #4 +; CHECK-NEXT: str.w r7, [r8, #45] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 +; CHECK-NEXT: str.w r4, [r8, #41] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s19, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movgt.w r11, #15 +; CHECK-NEXT: and r5, r11, #15 +; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] +; CHECK-NEXT: strb.w r6, [r8, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r3, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s17, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 +; CHECK-NEXT: movgt r3, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s17, s20 -; CHECK-NEXT: str r1, [r4, #16] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s17, #0 +; CHECK-NEXT: 
vmov r1, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] +; CHECK-NEXT: lsrl r0, r1, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s17, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #15 -; CHECK-NEXT: and r1, r5, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r8, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r8, #24] ; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI30_0: @@ -2923,195 +2924,196 @@ define arm_aapcs_vfpcc <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) { ; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI40_0 -; CHECK-NEXT: vmov r9, r5, d8 -; CHECK-NEXT: str r0, [sp, #44] @ 4-byte Spill -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: vmov r6, r5, d8 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: vmov r2, r9, d0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r7, r2 -; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: vldr d0, .LCPI40_1 -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: str r2, [sp, #40] @ 4-byte Spill -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: vmov r11, r3, d0 +; CHECK-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: str r5, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill -; CHECK-NEXT: csel r0, r2, r8, ne -; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: strd r1, r0, [sp, #20] @ 8-byte Folded Spill +; CHECK-NEXT: csel r0, r2, r4, ne +; CHECK-NEXT: str r3, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r4, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: str.w r0, [r8, #8] +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: mov r11, r7 -; CHECK-NEXT: str r0, [r4, #8] -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: str r6, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r7, r6 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte 
Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [r4, #4] -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: str.w r0, [r8, #4] +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r6, r8 +; CHECK-NEXT: strd r8, r7, [sp, #28] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r7, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r9, r7 -; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: vmov r8, r11, d9 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: str r0, [r6] +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r6, r5 +; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: mov r5, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: csel r7, r1, r10, ne -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: strd r0, r2, [sp, #20] @ 8-byte Folded Spill +; CHECK-NEXT: csel r0, r3, r7, ne +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: mov r4, r6 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r10, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: movne r0, #15 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: csel r9, r1, r0, ne -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: csel r0, r1, r0, ne +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r9, #-1 -; CHECK-NEXT: ldr r6, [sp, #44] @ 4-byte Reload -; 
CHECK-NEXT: lsrs r0, r7, #28 +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r7 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: orr.w r0, r0, r9, lsl #4 -; CHECK-NEXT: str r0, [r6, #20] +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: str.w r10, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str.w r9, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r5, r10 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: csel r4, r1, r0, ne -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: lsrs r0, r4, #28 -; CHECK-NEXT: orr.w r0, r0, r7, lsl #4 -; CHECK-NEXT: str r0, [r6, #16] -; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r10, r4 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: ldr.w r10, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: mov r1, r11 -; CHECK-NEXT: ldr.w r11, [sp, #40] @ 4-byte Reload -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: ldr.w r11, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r9, r7 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #15 -; CHECK-NEXT: lsr.w r1, r9, #28 -; CHECK-NEXT: ldr.w r9, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: strb.w r0, [r9, #24] -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: orr.w r1, r5, r0, lsl #4 +; CHECK-NEXT: strd r10, r1, [r2, #16] +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: and r1, r1, #15 +; CHECK-NEXT: lsrl r0, r1, #28 +; CHECK-NEXT: strb r0, [r2, #24] +; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl 
__aeabi_dcmpge -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csel r0, r1, r0, ne -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #15 ; CHECK-NEXT: and r0, r0, #15 ; CHECK-NEXT: orr.w r0, r0, r4, lsl #4 -; CHECK-NEXT: str.w r0, [r9, #12] +; CHECK-NEXT: str.w r0, [r8, #12] ; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -3639,73 +3641,90 @@ define arm_aapcs_vfpcc <8 x i16> @test_unsigned_v8f16_v8i16(<8 x half> %f) { define arm_aapcs_vfpcc <8 x i19> @test_unsigned_v8f16_v8i19(<8 x half> %f) { ; CHECK-LABEL: test_unsigned_v8f16_v8i19: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldr s6, .LCPI46_1 -; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vcvtb.f32.f16 s12, s2 -; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtt.f32.f16 s14, s1 -; CHECK-NEXT: vcvtb.f32.f16 s1, s3 -; CHECK-NEXT: vcvtt.f32.f16 s0, s0 -; CHECK-NEXT: vcvtt.f32.f16 s2, s2 +; CHECK-NEXT: .save {r4, r5, r6, r7, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, r11, lr} ; CHECK-NEXT: vldr s4, .LCPI46_0 -; CHECK-NEXT: vcvtt.f32.f16 s3, s3 -; CHECK-NEXT: vmaxnm.f32 s8, s8, s6 -; CHECK-NEXT: vmaxnm.f32 s10, s10, s6 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NEXT: vmaxnm.f32 s12, s12, s6 -; CHECK-NEXT: vmaxnm.f32 s14, s14, s6 -; CHECK-NEXT: vmaxnm.f32 s2, s2, s6 -; CHECK-NEXT: vmaxnm.f32 s1, s1, s6 -; CHECK-NEXT: vmaxnm.f32 s6, s3, s6 -; CHECK-NEXT: vminnm.f32 s8, s8, s4 -; CHECK-NEXT: vminnm.f32 s10, s10, s4 -; CHECK-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NEXT: vminnm.f32 s12, s12, s4 -; CHECK-NEXT: vminnm.f32 s14, s14, s4 -; CHECK-NEXT: vminnm.f32 s2, s2, s4 -; CHECK-NEXT: vminnm.f32 s1, s1, s4 -; CHECK-NEXT: vminnm.f32 s4, s6, s4 -; CHECK-NEXT: vcvt.u32.f32 s1, s1 -; CHECK-NEXT: vcvt.u32.f32 s4, s4 -; CHECK-NEXT: vcvt.u32.f32 s2, s2 +; CHECK-NEXT: vcvtb.f32.f16 s14, s1 +; CHECK-NEXT: vldr s6, .LCPI46_1 +; CHECK-NEXT: vcvtt.f32.f16 s12, s1 +; CHECK-NEXT: vmaxnm.f32 s14, s14, s4 +; CHECK-NEXT: vmaxnm.f32 s12, s12, s4 +; CHECK-NEXT: vminnm.f32 s14, s14, s6 +; CHECK-NEXT: vminnm.f32 s12, s12, s6 ; CHECK-NEXT: vcvt.u32.f32 s14, s14 +; CHECK-NEXT: vcvtb.f32.f16 s10, s0 ; CHECK-NEXT: vcvt.u32.f32 s12, s12 +; CHECK-NEXT: vcvtt.f32.f16 s0, s0 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vmaxnm.f32 s10, s10, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NEXT: vminnm.f32 s10, s10, s6 ; CHECK-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vcvt.u32.f32 s10, s10 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vcvt.u32.f32 s8, s8 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: lsrs r2, r1, #14 -; CHECK-NEXT: orr.w r12, r2, r3, lsl #5 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: strh.w r12, [r0, #16] -; CHECK-NEXT: lsrs r2, r3, #1 -; CHECK-NEXT: orr.w lr, r2, r1, lsl #18 +; CHECK-NEXT: vcvtt.f32.f16 s8, s2 +; CHECK-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-NEXT: vmaxnm.f32 s8, s8, s4 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: lsrs r1, r2, #7 -; CHECK-NEXT: orr.w r1, r1, r4, lsl #12 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #31 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: lsrs r4, r3, #13 -; CHECK-NEXT: orr.w r4, r4, r5, lsl #6 -; CHECK-NEXT: orr.w r2, r4, r2, lsl #25 +; CHECK-NEXT: vmaxnm.f32 s2, s2, s4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NEXT: vcvt.u32.f32 s2, s2 +; CHECK-NEXT: vminnm.f32 s8, s8, s6 
+; CHECK-NEXT: vcvt.u32.f32 s8, s8 +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s3 +; CHECK-NEXT: lsll r12, r1, #19 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #6 +; CHECK-NEXT: lsrl r2, r5, #26 +; CHECK-NEXT: orr.w r1, r1, r4, lsl #25 +; CHECK-NEXT: str r1, [r0, #4] +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: lsrl r4, r11, #7 +; CHECK-NEXT: orr.w r1, r1, r12 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: orr.w r1, r2, r4 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: lsll r2, r7, #12 ; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: orr.w r3, r4, r3, lsl #19 -; CHECK-NEXT: strd r3, r2, [r0] -; CHECK-NEXT: strd r1, lr, [r0, #8] -; CHECK-NEXT: lsr.w r1, r12, #16 -; CHECK-NEXT: strb r1, [r0, #18] -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: orrs r2, r1 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: lsll r4, r1, #31 +; CHECK-NEXT: orr.w r12, r2, r4 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s3 +; CHECK-NEXT: lsll r4, r3, #5 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: lsrl r6, r9, #14 +; CHECK-NEXT: orr.w r3, r6, r4 +; CHECK-NEXT: strh r3, [r0, #16] +; CHECK-NEXT: str.w r12, [r0, #8] +; CHECK-NEXT: lsrs r3, r3, #16 +; CHECK-NEXT: strb r3, [r0, #18] +; CHECK-NEXT: orr.w r3, r5, r11 +; CHECK-NEXT: orrs r3, r7 +; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #18 +; CHECK-NEXT: str r1, [r0, #12] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, r11, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI46_0: -; CHECK-NEXT: .long 0x48ffffe0 @ float 524287 -; CHECK-NEXT: .LCPI46_1: ; CHECK-NEXT: .long 0x00000000 @ float 0 +; CHECK-NEXT: .LCPI46_1: +; CHECK-NEXT: .long 0x48ffffe0 @ float 524287 %x = call <8 x i19> @llvm.fptoui.sat.v8f16.v8i19(<8 x half> %f) ret <8 x i19> %x } @@ -3749,46 +3768,46 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vcvtb.f32.f16 s24, s18 ; CHECK-NEXT: vmov r0, s24 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcvtt.f32.f16 s26, s19 +; CHECK-NEXT: vcvtt.f32.f16 s28, s19 ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov r0, s28 ; CHECK-NEXT: vcvtb.f32.f16 s22, s16 -; CHECK-NEXT: vcvtt.f32.f16 s18, s18 +; CHECK-NEXT: vcvtb.f32.f16 s26, s19 ; CHECK-NEXT: vcmp.f32 s24, #0 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmov r5, s22 ; CHECK-NEXT: vldr s20, .LCPI48_0 -; CHECK-NEXT: vmov r8, s18 +; CHECK-NEXT: vmov r11, s26 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s26, #0 -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: vmrs APSR_nzcv, 
fpscr -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r10, #65535 -; CHECK-NEXT: movtgt r10, #3 +; CHECK-NEXT: movwgt r4, #65535 +; CHECK-NEXT: movtgt r4, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: str.w r7, [r4, #25] +; CHECK-NEXT: str.w r7, [r10, #25] ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: vcmp.f32 s22, #0 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill @@ -3797,188 +3816,184 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str r0, [r4] +; CHECK-NEXT: mov r7, r4 +; CHECK-NEXT: str.w r0, [r10] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r6, #0 -; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r6, #-1 -; CHECK-NEXT: lsl.w r0, r10, #22 -; CHECK-NEXT: str r6, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: orr.w r6, r0, r6, lsr #10 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: bfc r7, #18, #14 +; CHECK-NEXT: lsll r6, r7, #22 ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r5, #-1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: bfc r1, #18, #14 +; CHECK-NEXT: vcvtt.f32.f16 s26, s18 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: orr.w r0, r1, r7 +; CHECK-NEXT: str.w r0, [r10, #45] +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: orrs r6, r2 +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcmp.f32 s26, #0 ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s26, s20 +; CHECK-NEXT: vcvtb.f32.f16 s18, s17 +; CHECK-NEXT: lsrs r0, r4, #10 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r5, #65535 -; CHECK-NEXT: movtgt r5, #3 +; CHECK-NEXT: movwgt r1, #65535 +; CHECK-NEXT: movtgt r1, #3 +; CHECK-NEXT: str.w r6, [r10, #41] +; CHECK-NEXT: strb.w r0, [r10, #49] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #45] -; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r7, #-1 -; CHECK-NEXT: lsrs r0, r7, #14 -; CHECK-NEXT: orr.w r0, r0, r5, lsl #18 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 -; CHECK-NEXT: str.w r0, [r4, #33] -; CHECK-NEXT: vmov r0, s18 +; 
CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: lsrl r4, r1, #14 +; CHECK-NEXT: orr.w r6, r1, r5, lsl #4 ; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcvtt.f32.f16 s26, s17 +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r5, #65535 +; CHECK-NEXT: movtgt r5, #3 +; CHECK-NEXT: str.w r6, [r10, #37] +; CHECK-NEXT: str.w r4, [r10, #33] +; CHECK-NEXT: bl __aeabi_f2ulz +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: vcmp.f32 s24, #0 +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r6, #65535 +; CHECK-NEXT: movtgt r6, #3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r9, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r9, #65535 ; CHECK-NEXT: movtgt r9, #3 -; CHECK-NEXT: lsl.w r0, r9, #22 -; CHECK-NEXT: orr.w r0, r0, r1, lsr #10 +; CHECK-NEXT: bfc r9, #18, #14 ; CHECK-NEXT: vcvtt.f32.f16 s16, s16 -; CHECK-NEXT: str r0, [r4, #20] +; CHECK-NEXT: orr.w r0, r9, r7, lsl #18 +; CHECK-NEXT: str.w r0, [r10, #29] ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: bfc r1, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: movlt r4, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: vcmp.f32 s16, #0 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r11, #65535 -; CHECK-NEXT: movtgt r11, #3 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r4, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #0 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: bfc r5, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #-1 -; CHECK-NEXT: lsr.w r0, r8, #14 -; CHECK-NEXT: vcvtb.f32.f16 s16, s19 -; CHECK-NEXT: orr.w r0, r0, r11, lsl #18 -; CHECK-NEXT: str r0, [r4, #8] -; CHECK-NEXT: lsr.w r0, r10, #10 -; CHECK-NEXT: strb.w r0, [r4, #49] -; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: movgt.w r11, #-1 +; CHECK-NEXT: mov r8, r11 +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: lsll r4, r1, #22 +; CHECK-NEXT: lsrl r8, r5, #28 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: itt gt +; CHECK-NEXT: movwgt r7, #65535 +; CHECK-NEXT: movtgt r7, #3 +; CHECK-NEXT: orrs r1, r5 +; CHECK-NEXT: str.w r1, [r10, #20] ; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: orr.w r2, r8, r4 ; CHECK-NEXT: vmrs APSR_nzcv, 
fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: ubfx r0, r5, #14, #4 +; CHECK-NEXT: bfc r7, #18, #14 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r6, #-1 -; CHECK-NEXT: orr.w r0, r0, r6, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: vcmp.f32 s24, #0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s24, s20 -; CHECK-NEXT: vcvtb.f32.f16 s18, s17 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r0, #65535 -; CHECK-NEXT: movtgt r0, #3 -; CHECK-NEXT: bfc r0, #18, #14 -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: orr.w r0, r0, r7, lsl #18 -; CHECK-NEXT: str.w r0, [r4, #29] -; CHECK-NEXT: lsr.w r0, r9, #10 -; CHECK-NEXT: strb r0, [r4, #24] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: bl __aeabi_f2ulz -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: ubfx r2, r11, #14, #4 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #4 -; CHECK-NEXT: str r2, [r4, #12] -; CHECK-NEXT: vcmp.f32 s22, #0 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r2, #65535 -; CHECK-NEXT: movtgt r2, #3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: itt gt ; CHECK-NEXT: movwgt r1, #65535 ; CHECK-NEXT: movtgt r1, #3 -; CHECK-NEXT: bfc r2, #18, #14 +; CHECK-NEXT: str.w r2, [r10, #16] +; CHECK-NEXT: lsrs r2, r6, #10 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: strb.w r2, [r10, #24] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: itt gt -; CHECK-NEXT: movwgt r10, #65535 -; CHECK-NEXT: movtgt r10, #3 -; CHECK-NEXT: orr.w r2, r2, r8, lsl #18 -; CHECK-NEXT: str r2, [r4, #4] -; CHECK-NEXT: bfc r10, #18, #14 -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: lsrs r2, r6, #28 ; CHECK-NEXT: bfc r1, #18, #14 -; CHECK-NEXT: orr.w r2, r2, r10, lsl #4 -; CHECK-NEXT: lsrs r0, r0, #28 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #22 -; CHECK-NEXT: str.w r2, [r4, #41] -; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: orr.w r0, r0, r1, lsl #22 -; CHECK-NEXT: str r0, [r4, #16] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: orr.w r0, r7, r0, lsl #18 +; CHECK-NEXT: lsrl r2, r1, #14 +; CHECK-NEXT: orr.w r1, r1, r11, lsl #4 +; CHECK-NEXT: strd r2, r1, [r10, #8] +; CHECK-NEXT: str.w r0, [r10, #4] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 2 @@ -4192,21 +4207,61 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 
x half> %f) { define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-LABEL: test_unsigned_v8f16_v8i100: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vcvtb.f32.f16 s28, s19 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcvtb.f32.f16 s26, s18 -; CHECK-NEXT: mov r5, r3 -; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vcvtb.f32.f16 s30, s19 +; CHECK-NEXT: vcvtb.f32.f16 s28, s18 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vcvtt.f32.f16 s22, s19 +; CHECK-NEXT: vcvtb.f32.f16 s24, s16 +; CHECK-NEXT: vcvtb.f32.f16 s26, s17 ; CHECK-NEXT: vldr s20, .LCPI50_1 +; CHECK-NEXT: vmov r8, s22 +; CHECK-NEXT: vmov r5, s28 +; CHECK-NEXT: vcvtt.f32.f16 s18, s18 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov r6, s26 +; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: str.w r2, [r9, #83] +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: str.w r1, [r9, #79] +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt.w r0, #-1 +; CHECK-NEXT: str.w r0, [r9, #75] +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s28, #0 -; CHECK-NEXT: vcvtt.f32.f16 s30, s19 +; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt @@ -4217,7 +4272,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: str.w r2, [r4, #83] +; CHECK-NEXT: str.w r2, [r9, #58] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -4225,23 +4280,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #79] +; CHECK-NEXT: str.w r1, [r9, #54] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s28, s20 -; CHECK-NEXT: vcvtb.f32.f16 s22, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: vcvtb.f32.f16 s24, s17 -; CHECK-NEXT: str.w r0, [r4, #75] -; CHECK-NEXT: vmov r9, s30 -; CHECK-NEXT: vmov r8, s22 -; CHECK-NEXT: vmov r6, s24 -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: str.w r0, [r9, #50] +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s26, #0 -; CHECK-NEXT: mov r7, r3 +; 
CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt @@ -4252,7 +4302,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: str.w r2, [r4, #58] +; CHECK-NEXT: str.w r2, [r9, #33] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -4260,18 +4310,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #54] +; CHECK-NEXT: str.w r1, [r9, #29] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str.w r0, [r4, #50] -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: str.w r0, [r9, #25] +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s24, #0 -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: mov r4, r3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: it lt @@ -4282,7 +4332,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: movgt.w r2, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s24, s20 -; CHECK-NEXT: str.w r2, [r4, #33] +; CHECK-NEXT: str.w r2, [r9, #8] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr @@ -4290,227 +4340,200 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r1, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r1, [r4, #29] +; CHECK-NEXT: str.w r1, [r9, #4] ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str.w r0, [r4, #25] +; CHECK-NEXT: str.w r0, [r9] ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: vcmp.f32 s22, #0 -; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s22, s20 -; CHECK-NEXT: str r2, [r4, #8] +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: vcmp.f32 s30, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r6, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str r1, [r4, #4] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: str r0, [r4] -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: movgt r7, #15 +; CHECK-NEXT: and r0, r7, #15 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: orr.w r1, r0, r6, lsl #4 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: mov r10, r2 +; CHECK-NEXT: str.w r1, [r9, #87] ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: mov r8, r0 ; 
CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt.w r8, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s28, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r8, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r5, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r6, r1, #28 -; CHECK-NEXT: vcmp.f32 s30, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 +; CHECK-NEXT: movgt r5, #15 +; CHECK-NEXT: and r0, r5, #15 +; CHECK-NEXT: vcvtt.f32.f16 s28, s17 +; CHECK-NEXT: orr.w r0, r0, r8, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #62] +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #95] +; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: vcvtt.f32.f16 s16, s16 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt ; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: vcmp.f32 s30, #0 -; CHECK-NEXT: str.w r1, [r4, #91] -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 -; CHECK-NEXT: lsrs r1, r2, #28 -; CHECK-NEXT: vcvtt.f32.f16 s30, s18 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r1, r3, lsl #4 -; CHECK-NEXT: vmov r1, s30 -; CHECK-NEXT: strb.w r2, [r4, #99] -; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: vcmp.f32 s26, #0 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s26, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #0 -; CHECK-NEXT: vcmp.f32 s28, s20 +; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r5, #15 -; CHECK-NEXT: and r2, r5, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #87] -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: movgt r0, #15 +; CHECK-NEXT: and r0, r0, #15 +; CHECK-NEXT: orr.w r0, r0, r1, lsl #4 +; CHECK-NEXT: str.w r0, [r9, #37] +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s30, #0 -; CHECK-NEXT: vcvtt.f32.f16 s18, s17 +; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r0, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, #0 +; CHECK-NEXT: vcmp.f32 s24, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 +; CHECK-NEXT: movgt.w r0, #-1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s24, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w 
r6, r1, #28 -; CHECK-NEXT: vcmp.f32 s30, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r6, r6, r2, lsl #4 +; CHECK-NEXT: movgt r4, #15 +; CHECK-NEXT: and r5, r4, #15 +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: orr.w r5, r5, r0, lsl #4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r6, [r4, #70] +; CHECK-NEXT: str.w r5, [r9, #12] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 -; CHECK-NEXT: vcmp.f32 s30, s20 +; CHECK-NEXT: movlt.w r11, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r6, r0, #28 -; CHECK-NEXT: orr.w r1, r6, r1, lsl #4 -; CHECK-NEXT: str.w r1, [r4, #66] -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vcmp.f32 s30, #0 -; CHECK-NEXT: lsrs r2, r2, #28 +; CHECK-NEXT: movgt.w r11, #-1 +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: lsrl r6, r11, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s30, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt.w r10, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s26, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movgt.w r10, #-1 +; CHECK-NEXT: orr.w r5, r11, r10, lsl #4 +; CHECK-NEXT: str.w r5, [r9, #95] +; CHECK-NEXT: str.w r6, [r9, #91] +; CHECK-NEXT: vcmp.f32 s22, #0 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #74] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #0 -; CHECK-NEXT: vcmp.f32 s26, s20 -; CHECK-NEXT: vcvtt.f32.f16 s16, s16 +; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: vcmp.f32 s22, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r7, #15 -; CHECK-NEXT: and r2, r7, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #62] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: movgt r6, #15 +; CHECK-NEXT: and r5, r6, #15 ; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: lsrl r10, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 +; CHECK-NEXT: strb.w r10, [r9, #99] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r7, r1, #28 -; CHECK-NEXT: vcmp.f32 s18, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 +; CHECK-NEXT: movgt r7, #15 +; CHECK-NEXT: vcmp.f32 s18, #0 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: str.w r7, [r4, #45] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r6, #0 ; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r7 ; CHECK-NEXT: vcmp.f32 s18, #0 -; CHECK-NEXT: orr.w r7, r7, r1, lsl #4 -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s18, s20 -; CHECK-NEXT: str.w r7, [r4, #41] +; CHECK-NEXT: lsrl r8, r5, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: 
movlt r4, #0 +; CHECK-NEXT: vcmp.f32 s18, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r2, r2, #28 -; CHECK-NEXT: vcmp.f32 s24, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r2, r2, r3, lsl #4 +; CHECK-NEXT: movgt.w r4, #-1 +; CHECK-NEXT: orr.w r6, r5, r4, lsl #4 +; CHECK-NEXT: and r5, r7, #15 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: str.w r6, [r9, #70] +; CHECK-NEXT: str.w r8, [r9, #66] +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: strb.w r4, [r9, #74] ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb.w r2, [r4, #49] +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r10, #0 -; CHECK-NEXT: vcmp.f32 s24, s20 +; CHECK-NEXT: movlt r7, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r10, #15 -; CHECK-NEXT: and r2, r10, #15 -; CHECK-NEXT: orr.w r0, r2, r0, lsl #4 -; CHECK-NEXT: str.w r0, [r4, #37] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: movgt r7, #15 +; CHECK-NEXT: mov r12, r7 +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: movlt r7, #0 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r1, #-1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: movgt.w r7, #-1 ; CHECK-NEXT: b.w .LBB50_2 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: @@ -4518,47 +4541,61 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: .long 0x717fffff @ float 1.26765052E+30 ; CHECK-NEXT: .p2align 1 ; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vmov q0[3], q0[1], r7, r12 +; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: vcmp.f32 s28, #0 +; CHECK-NEXT: lsrl r4, r5, #28 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r6, #0 +; CHECK-NEXT: vcmp.f32 s28, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r2, #-1 -; CHECK-NEXT: lsrs r7, r1, #28 +; CHECK-NEXT: movgt.w r6, #-1 +; CHECK-NEXT: orr.w r7, r5, r6, lsl #4 +; CHECK-NEXT: and r5, r12, #15 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: lsrl r6, r5, #28 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: orr.w r7, r7, r2, lsl #4 -; CHECK-NEXT: str r7, [r4, #20] +; CHECK-NEXT: str.w r7, [r9, #45] +; CHECK-NEXT: str.w r4, [r9, #41] +; CHECK-NEXT: strb.w r6, [r9, #49] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #0 +; CHECK-NEXT: movlt r3, #0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f32 s16, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r0, #-1 -; CHECK-NEXT: lsrs r7, r0, #28 -; CHECK-NEXT: orr.w r1, r7, r1, lsl #4 +; CHECK-NEXT: movgt r3, #15 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f32 s16, s20 -; CHECK-NEXT: str r1, [r4, #16] ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #0 +; CHECK-NEXT: movlt r1, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: lsr.w r1, r2, #28 -; CHECK-NEXT: vcmp.f32 s22, #0 ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt r3, #15 -; CHECK-NEXT: orr.w r1, r1, r3, lsl #4 +; CHECK-NEXT: movgt.w r1, 
#-1 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vcmp.f32 s16, #0 +; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: strb r1, [r4, #24] +; CHECK-NEXT: lsrl r0, r1, #28 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w r8, #0 -; CHECK-NEXT: vcmp.f32 s22, s20 +; CHECK-NEXT: movlt r2, #0 +; CHECK-NEXT: vcmp.f32 s16, s20 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: it gt -; CHECK-NEXT: movgt.w r8, #15 -; CHECK-NEXT: and r1, r8, #15 -; CHECK-NEXT: orr.w r0, r1, r0, lsl #4 -; CHECK-NEXT: str r0, [r4, #12] +; CHECK-NEXT: movgt.w r2, #-1 +; CHECK-NEXT: orr.w r1, r1, r2, lsl #4 +; CHECK-NEXT: strd r0, r1, [r9, #16] +; CHECK-NEXT: and r1, r3, #15 +; CHECK-NEXT: lsrl r2, r1, #28 +; CHECK-NEXT: strb.w r2, [r9, #24] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: @ %bb.3: %x = call <8 x i100> @llvm.fptoui.sat.v8f16.v8i100(<8 x half> %f) ret <8 x i100> %x -- cgit v1.1 From 00e80fbfb9151a68e7383dcec7da69c867225e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danny=20M=C3=B6sch?= Date: Sun, 11 Feb 2024 19:43:34 +0100 Subject: [NFC] Correct C++ standard names (#81421) --- .../clang-tidy/modernize/DeprecatedHeadersCheck.cpp | 2 +- .../docs/clang-tidy/checks/modernize/deprecated-headers.rst | 2 +- .../docs/clang-tidy/checks/modernize/use-override.rst | 2 +- .../docs/clang-tidy/checks/readability/container-contains.rst | 2 +- .../docs/clang-tidy/checks/readability/use-anyofallof.rst | 2 +- clang/include/clang/Basic/Module.h | 6 +++--- clang/lib/Basic/Module.cpp | 4 ++-- clang/lib/Headers/stdatomic.h | 2 +- clang/lib/Lex/DependencyDirectivesScanner.cpp | 2 +- clang/test/Analysis/bitwise-shift-common.c | 2 +- clang/unittests/Analysis/FlowSensitive/TransferTest.cpp | 2 +- clang/unittests/Lex/DependencyDirectivesScannerTest.cpp | 2 +- libcxx/docs/FeatureTestMacroTable.rst | 10 +++++----- libcxx/include/__locale_dir/locale_base_api/ibm.h | 2 +- .../ostream.inserters.arithmetic/pointer.volatile.pass.cpp | 2 +- libcxx/utils/generate_feature_test_macro_components.py | 2 +- llvm/docs/CMake.rst | 2 +- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp index 6d287eb..6a46791 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp @@ -158,7 +158,7 @@ IncludeModernizePPCallbacks::IncludeModernizePPCallbacks( {"wctype.h", "cwctype"}})) { CStyledHeaderToCxx.insert(KeyValue); } - // Add C++ 11 headers. + // Add C++11 headers. if (LangOpts.CPlusPlus11) { for (const auto &KeyValue : std::vector>( diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst index 974a56a..298243f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst @@ -4,7 +4,7 @@ modernize-deprecated-headers ============================ Some headers from C library were deprecated in C++ and are no longer welcome in -C++ codebases. Some have no effect in C++. For more details refer to the C++ 14 +C++ codebases. Some have no effect in C++. 
For more details refer to the C++14 Standard [depr.c.headers] section. This check replaces C standard library headers with their C++ alternatives and diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst index 0440ab85..f8f3479 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst @@ -10,7 +10,7 @@ removes ``virtual`` from those functions as it is not required. user that a function was virtual. C++ compilers did not use the presence of this to signify an overridden function. -In C++ 11 ``override`` and ``final`` keywords were introduced to allow +In C++11 ``override`` and ``final`` keywords were introduced to allow overridden functions to be marked appropriately. Their presence allows compilers to verify that an overridden function correctly overrides a base class implementation. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst index 07d1e35..b28daec 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst @@ -3,7 +3,7 @@ readability-container-contains ============================== -Finds usages of ``container.count()`` and ``container.find() == container.end()`` which should be replaced by a call to the ``container.contains()`` method introduced in C++ 20. +Finds usages of ``container.count()`` and ``container.find() == container.end()`` which should be replaced by a call to the ``container.contains()`` method introduced in C++20. Whether an element is contained inside a container should be checked with ``contains`` instead of ``count``/``find`` because ``contains`` conveys the intent more clearly. Furthermore, for containers which permit multiple entries per key (``multimap``, ``multiset``, ...), ``contains`` is more efficient than ``count`` because ``count`` has to do unnecessary additional work. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst index f7bd9ff..6e58766 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst @@ -4,7 +4,7 @@ readability-use-anyofallof ========================== Finds range-based for loops that can be replaced by a call to ``std::any_of`` or -``std::all_of``. In C++ 20 mode, suggests ``std::ranges::any_of`` or +``std::all_of``. In C++20 mode, suggests ``std::ranges::any_of`` or ``std::ranges::all_of``. Example: diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index 62786e3..30ec9c9 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -118,7 +118,7 @@ public: /// of header files. ModuleMapModule, - /// This is a C++ 20 header unit. + /// This is a C++20 header unit. ModuleHeaderUnit, /// This is a C++20 module interface unit. @@ -127,10 +127,10 @@ public: /// This is a C++20 module implementation unit. ModuleImplementationUnit, - /// This is a C++ 20 module partition interface. + /// This is a C++20 module partition interface. ModulePartitionInterface, - /// This is a C++ 20 module partition implementation. 
+ /// This is a C++20 module partition implementation. ModulePartitionImplementation, /// This is the explicit Global Module Fragment of a modular TU. diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index 9252174..1c5043a 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -376,7 +376,7 @@ Module *Module::findOrInferSubmodule(StringRef Name) { Module *Module::getGlobalModuleFragment() const { assert(isNamedModuleUnit() && "We should only query the global module " - "fragment from the C++ 20 Named modules"); + "fragment from the C++20 Named modules"); for (auto *SubModule : SubModules) if (SubModule->isExplicitGlobalModule()) @@ -387,7 +387,7 @@ Module *Module::getGlobalModuleFragment() const { Module *Module::getPrivateModuleFragment() const { assert(isNamedModuleUnit() && "We should only query the private module " - "fragment from the C++ 20 Named modules"); + "fragment from the C++20 Named modules"); for (auto *SubModule : SubModules) if (SubModule->isPrivateModule()) diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h index 521c473d..9c103d9 100644 --- a/clang/lib/Headers/stdatomic.h +++ b/clang/lib/Headers/stdatomic.h @@ -16,7 +16,7 @@ * Exclude the MSVC path as well as the MSVC header as of the 14.31.30818 * explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback * to the clang resource header until that is fully supported. The - * `stdatomic.h` header requires C++ 23 or newer. + * `stdatomic.h` header requires C++23 or newer. */ #if __STDC_HOSTED__ && \ __has_include_next() && \ diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index 980f865..0971daa 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -369,7 +369,7 @@ static void skipBlockComment(const char *&First, const char *const End) { } } -/// \returns True if the current single quotation mark character is a C++ 14 +/// \returns True if the current single quotation mark character is a C++14 /// digit separator. static bool isQuoteCppDigitSeparator(const char *const Start, const char *const Cur, diff --git a/clang/test/Analysis/bitwise-shift-common.c b/clang/test/Analysis/bitwise-shift-common.c index 39108bc..5f37d99 100644 --- a/clang/test/Analysis/bitwise-shift-common.c +++ b/clang/test/Analysis/bitwise-shift-common.c @@ -154,7 +154,7 @@ int expression_tracked_back(void) { //===----------------------------------------------------------------------===// int allow_overflows_and_negative_operands(void) { - // These are all legal under C++ 20 and many compilers accept them under + // These are all legal under C++20 and many compilers accept them under // earlier standards as well. int int_min = 1 << 31; // no-warning int this_overflows = 1027 << 30; // no-warning diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 8bbb040..55af702 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -2093,7 +2093,7 @@ TEST(TransferTest, TemporaryObject) { TEST(TransferTest, ElidableConstructor) { // This test is effectively the same as TransferTest.TemporaryObject, but - // the code is compiled as C++ 14. + // the code is compiled as C++14. 
std::string Code = R"( struct A { int Bar; diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp index bc4eee7..59fef9e 100644 --- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp +++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp @@ -583,7 +583,7 @@ TEST(MinimizeSourceToDependencyDirectivesTest, UnderscorePragma) { R"(_Pragma(u"clang module import"))", Out)); EXPECT_STREQ("\n", Out.data()); - // FIXME: R"()" strings depend on using C++ 11 language mode + // FIXME: R"()" strings depend on using C++11 language mode ASSERT_FALSE(minimizeSourceToDependencyDirectives( R"(_Pragma(R"abc(clang module import)abc"))", Out)); EXPECT_STREQ("\n", Out.data()); diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index a5c6fa2..468226c 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -24,7 +24,7 @@ Status =================================================== ================= Macro Name Value =================================================== ================= - **C++ 14** + **C++14** --------------------------------------------------------------------- ``__cpp_lib_chrono_udls`` ``201304L`` --------------------------------------------------- ----------------- @@ -66,7 +66,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_tuples_by_type`` ``201304L`` --------------------------------------------------- ----------------- - **C++ 17** + **C++17** --------------------------------------------------------------------- ``__cpp_lib_addressof_constexpr`` ``201603L`` --------------------------------------------------- ----------------- @@ -166,7 +166,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_void_t`` ``201411L`` --------------------------------------------------- ----------------- - **C++ 20** + **C++20** --------------------------------------------------------------------- ``__cpp_lib_array_constexpr`` ``201811L`` --------------------------------------------------- ----------------- @@ -300,7 +300,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_unwrap_ref`` ``201811L`` --------------------------------------------------- ----------------- - **C++ 23** + **C++23** --------------------------------------------------------------------- ``__cpp_lib_adaptor_iterator_pair_constructor`` ``202106L`` --------------------------------------------------- ----------------- @@ -388,7 +388,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_unreachable`` ``202202L`` --------------------------------------------------- ----------------- - **C++ 26** + **C++26** --------------------------------------------------------------------- ``__cpp_lib_associative_heterogeneous_insertion`` *unimplemented* --------------------------------------------------- ----------------- diff --git a/libcxx/include/__locale_dir/locale_base_api/ibm.h b/libcxx/include/__locale_dir/locale_base_api/ibm.h index 498ea1e..c5d7f34 100644 --- a/libcxx/include/__locale_dir/locale_base_api/ibm.h +++ b/libcxx/include/__locale_dir/locale_base_api/ibm.h @@ -100,7 +100,7 @@ inline _LIBCPP_HIDE_FROM_ABI int vasprintf(char** strp, const char* fmt, va_list } va_list ap_copy; - // va_copy may not be provided by the C library in C++ 03 mode. + // va_copy may not be provided by the C library in C++03 mode. 
#if defined(_LIBCPP_CXX03_LANG) && __has_builtin(__builtin_va_copy) __builtin_va_copy(ap_copy, ap); #else diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp index b016bab..69d84f64 100644 --- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp @@ -17,7 +17,7 @@ // // If the library was built in c++23 mode, this test would succeed. // -// Older CMake passed -std:c++latest to set C++ 20 mode on clang-cl, which +// Older CMake passed -std:c++latest to set C++20 mode on clang-cl, which // hid this issue. With newer CMake versions, it passes -std:c++20 which // makes this fail. // diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index cc1fc50..9e7ea86 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1769,7 +1769,7 @@ def pad_cell(s, length, left_align=True): def get_status_table(): table = [["Macro Name", "Value"]] for std in get_std_dialects(): - table += [["**" + std.replace("c++", "C++ ") + "**", ""]] + table += [["**" + std.replace("c++", "C++") + "**", ""]] for tc in feature_test_macros: if std not in tc["values"].keys(): continue diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 20f73c9..abef4f8 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -277,7 +277,7 @@ manual, or execute ``cmake --help-variable VARIABLE_NAME``. **CMAKE_CXX_STANDARD**:STRING Sets the C++ standard to conform to when building LLVM. Possible values are - 17 and 20. LLVM Requires C++ 17 or higher. This defaults to 17. + 17 and 20. LLVM Requires C++17 or higher. This defaults to 17. **CMAKE_INSTALL_BINDIR**:PATH The path to install executables, relative to the *CMAKE_INSTALL_PREFIX*. -- cgit v1.1 From ffab5a089b1e94b3305fbdfdf1547b751121c090 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Sun, 11 Feb 2024 10:50:59 -0800 Subject: Add a test for the A16/A17 parts of eb1b428750181ea742c547db0bc7136cd5b8f732 There are a couple of open questions on what we should do for A14, so I'll leave that off for now. 
https://github.com/llvm/llvm-project/pull/81325#issuecomment-1937489565 --- llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll index cddcd46..a75c303 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll @@ -13,6 +13,8 @@ ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s @g = common local_unnamed_addr global ptr null, align 8 -- cgit v1.1 From 03f571995b4f0c260254955afd16ec44d0764794 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 11 Feb 2024 12:59:05 -0800 Subject: [clang-format][NFC] Make LangOpts global in namespace Format (#81390) --- clang/include/clang/Format/Format.h | 6 ---- clang/lib/Format/Format.cpp | 30 ------------------- clang/lib/Format/FormatTokenLexer.cpp | 12 +++----- clang/lib/Format/FormatTokenLexer.h | 2 -- clang/lib/Format/IntegerLiteralSeparatorFixer.cpp | 2 +- clang/lib/Format/TokenAnalyzer.cpp | 36 ++++++++++++++++++++++- clang/lib/Format/TokenAnalyzer.h | 2 ++ clang/unittests/Format/TestLexer.h | 4 ++- 8 files changed, 45 insertions(+), 49 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index d9c18e5..b7e8246 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -14,7 +14,6 @@ #ifndef LLVM_CLANG_FORMAT_FORMAT_H #define LLVM_CLANG_FORMAT_FORMAT_H -#include "clang/Basic/LangOptions.h" #include "clang/Tooling/Core/Replacement.h" #include "clang/Tooling/Inclusions/IncludeStyle.h" #include "llvm/ADT/ArrayRef.h" @@ -5179,11 +5178,6 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, ArrayRef Ranges, StringRef FileName = ""); -/// Returns the ``LangOpts`` that the formatter expects you to set. -/// -/// \param Style determines specific settings for lexing mode. -LangOptions getFormattingLangOpts(const FormatStyle &Style = getLLVMStyle()); - /// Description to be used for help text for a ``llvm::cl`` option for /// specifying format style. The description is closely related to the operation /// of ``getStyle()``. 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index d2cc466..8431d3c 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3823,36 +3823,6 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, return UsingDeclarationsSorter(*Env, Style).process().first; } -LangOptions getFormattingLangOpts(const FormatStyle &Style) { - LangOptions LangOpts; - - FormatStyle::LanguageStandard LexingStd = Style.Standard; - if (LexingStd == FormatStyle::LS_Auto) - LexingStd = FormatStyle::LS_Latest; - if (LexingStd == FormatStyle::LS_Latest) - LexingStd = FormatStyle::LS_Cpp20; - LangOpts.CPlusPlus = 1; - LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; - LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; - LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; - LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; - LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; - // Turning on digraphs in standards before C++0x is error-prone, because e.g. - // the sequence "<::" will be unconditionally treated as "[:". - // Cf. Lexer::LexTokenInternal. - LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; - - LangOpts.LineComment = 1; - bool AlternativeOperators = Style.isCpp(); - LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; - LangOpts.Bool = 1; - LangOpts.ObjC = 1; - LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. - LangOpts.DeclSpecKeyword = 1; // To get __declspec. - LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. - return LangOpts; -} - const char *StyleOptionHelpDescription = "Set coding style. can be:\n" "1. A preset: LLVM, GNU, Google, Chromium, Microsoft,\n" diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index a87d0ba..a57659f 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -13,11 +13,7 @@ //===----------------------------------------------------------------------===// #include "FormatTokenLexer.h" -#include "FormatToken.h" -#include "clang/Basic/SourceLocation.h" -#include "clang/Basic/SourceManager.h" -#include "clang/Format/Format.h" -#include "llvm/Support/Regex.h" +#include "TokenAnalyzer.h" namespace clang { namespace format { @@ -28,12 +24,12 @@ FormatTokenLexer::FormatTokenLexer( llvm::SpecificBumpPtrAllocator &Allocator, IdentifierTable &IdentTable) : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), - Column(Column), TrailingWhitespace(0), - LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), + Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), Style(Style), IdentTable(IdentTable), Keywords(IdentTable), Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), MacroBlockEndRegex(Style.MacroBlockEnd) { + assert(LangOpts.CPlusPlus); Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts)); Lex->SetKeepWhitespaceMode(true); @@ -1442,7 +1438,7 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) { void FormatTokenLexer::resetLexer(unsigned Offset) { StringRef Buffer = SourceMgr.getBufferData(ID); - LangOpts = getFormattingLangOpts(Style); + assert(LangOpts.CPlusPlus); Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts, Buffer.begin(), Buffer.begin() + Offset, Buffer.end())); Lex->SetKeepWhitespaceMode(true); diff --git a/clang/lib/Format/FormatTokenLexer.h 
b/clang/lib/Format/FormatTokenLexer.h index 65dd733..0d0f36f 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -17,7 +17,6 @@ #include "Encoding.h" #include "FormatToken.h" -#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -120,7 +119,6 @@ private: unsigned Column; unsigned TrailingWhitespace; std::unique_ptr Lex; - LangOptions LangOpts; const SourceManager &SourceMgr; FileID ID; const FormatStyle &Style; diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp index 87823ae..3c2cedd 100644 --- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp +++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp @@ -79,7 +79,7 @@ IntegerLiteralSeparatorFixer::process(const Environment &Env, AffectedRangeManager AffectedRangeMgr(SourceMgr, Env.getCharRanges()); const auto ID = Env.getFileID(); - const auto LangOpts = getFormattingLangOpts(Style); + assert(LangOpts.CPlusPlus); Lexer Lex(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts); Lex.SetCommentRetentionState(true); diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp index bd648c4..4e77683 100644 --- a/clang/lib/Format/TokenAnalyzer.cpp +++ b/clang/lib/Format/TokenAnalyzer.cpp @@ -35,6 +35,38 @@ namespace clang { namespace format { +LangOptions LangOpts; + +/// Sets `LangOpts` for the formatter. +/// +/// \param `Style` determines specific settings for lexing mode. +static void setFormattingLangOpts(const FormatStyle &Style) { + FormatStyle::LanguageStandard LexingStd = Style.Standard; + if (LexingStd == FormatStyle::LS_Auto) + LexingStd = FormatStyle::LS_Latest; + if (LexingStd == FormatStyle::LS_Latest) + LexingStd = FormatStyle::LS_Cpp20; + LangOpts.CPlusPlus = 1; + LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; + LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; + LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; + LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; + LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; + // Turning on digraphs in standards before C++0x is error-prone, because e.g. + // the sequence "<::" will be unconditionally treated as "[:". + // Cf. Lexer::LexTokenInternal. + LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; + + LangOpts.LineComment = 1; + bool AlternativeOperators = Style.isCpp(); + LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; + LangOpts.Bool = 1; + LangOpts.ObjC = 1; + LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. + LangOpts.DeclSpecKeyword = 1; // To get __declspec. + LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. +} + // FIXME: Instead of printing the diagnostic we should store it and have a // better way to return errors through the format APIs. 
class FatalDiagnosticConsumer : public DiagnosticConsumer { @@ -99,9 +131,11 @@ TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style) std::pair TokenAnalyzer::process(bool SkipAnnotation) { + setFormattingLangOpts(Style); + tooling::Replacements Result; llvm::SpecificBumpPtrAllocator Allocator; - IdentifierTable IdentTable(getFormattingLangOpts(Style)); + IdentifierTable IdentTable(LangOpts); FormatTokenLexer Lex(Env.getSourceManager(), Env.getFileID(), Env.getFirstStartColumn(), Style, Encoding, Allocator, IdentTable); diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h index 4086dab..18c1431 100644 --- a/clang/lib/Format/TokenAnalyzer.h +++ b/clang/lib/Format/TokenAnalyzer.h @@ -34,6 +34,8 @@ namespace clang { namespace format { +extern LangOptions LangOpts; + class Environment { public: // This sets up an virtual file system with file \p FileName containing the diff --git a/clang/unittests/Format/TestLexer.h b/clang/unittests/Format/TestLexer.h index 8b5949b..6a3d0bd 100644 --- a/clang/unittests/Format/TestLexer.h +++ b/clang/unittests/Format/TestLexer.h @@ -61,7 +61,9 @@ public: std::vector> &Buffers, FormatStyle Style = getLLVMStyle()) : Allocator(Allocator), Buffers(Buffers), Style(Style), - SourceMgr("test.cpp", ""), IdentTable(getFormattingLangOpts(Style)) {} + SourceMgr("test.cpp", ""), IdentTable(LangOpts) { + assert(LangOpts.CPlusPlus); + } TokenList lex(llvm::StringRef Code) { FormatTokenLexer Lex = getNewLexer(Code); -- cgit v1.1 From 3dc8ef677d7d05116a0bf6524eb38b02ca6ba042 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 11 Feb 2024 13:08:28 -0800 Subject: Revert "[clang-format][NFC] Make LangOpts global in namespace Format (#81390)" This reverts commit 03f571995b4f0c260254955afd16ec44d0764794. We can't hide getFormattingLangOpts() as it's used by other tools. --- clang/include/clang/Format/Format.h | 6 ++++ clang/lib/Format/Format.cpp | 30 +++++++++++++++++++ clang/lib/Format/FormatTokenLexer.cpp | 12 +++++--- clang/lib/Format/FormatTokenLexer.h | 2 ++ clang/lib/Format/IntegerLiteralSeparatorFixer.cpp | 2 +- clang/lib/Format/TokenAnalyzer.cpp | 36 +---------------------- clang/lib/Format/TokenAnalyzer.h | 2 -- clang/unittests/Format/TestLexer.h | 4 +-- 8 files changed, 49 insertions(+), 45 deletions(-) diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index b7e8246..d9c18e5 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_FORMAT_FORMAT_H #define LLVM_CLANG_FORMAT_FORMAT_H +#include "clang/Basic/LangOptions.h" #include "clang/Tooling/Core/Replacement.h" #include "clang/Tooling/Inclusions/IncludeStyle.h" #include "llvm/ADT/ArrayRef.h" @@ -5178,6 +5179,11 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, ArrayRef Ranges, StringRef FileName = ""); +/// Returns the ``LangOpts`` that the formatter expects you to set. +/// +/// \param Style determines specific settings for lexing mode. +LangOptions getFormattingLangOpts(const FormatStyle &Style = getLLVMStyle()); + /// Description to be used for help text for a ``llvm::cl`` option for /// specifying format style. The description is closely related to the operation /// of ``getStyle()``. 
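To make the revert rationale concrete, here is a minimal sketch of the kind of out-of-tree caller the public API has to keep working; the tool, its asserts, and the chosen standard are hypothetical, but every name used is declared in the hunks of this patch:

#include "clang/Format/Format.h"
#include <cassert>

int main() {
  // Ask the formatter which LangOptions it would lex with for C++17 code.
  clang::format::FormatStyle Style = clang::format::getLLVMStyle();
  Style.Standard = clang::format::FormatStyle::LS_Cpp17;
  clang::LangOptions Opts = clang::format::getFormattingLangOpts(Style);
  // The formatter always lexes as C++ with line comments enabled.
  assert(Opts.CPlusPlus && Opts.LineComment);
  // Standard-gated flags track the configured LanguageStandard.
  assert(Opts.CPlusPlus17 && !Opts.CPlusPlus20);
  return 0;
}

A file-local global set inside TokenAnalyzer.cpp (the approach being reverted) cannot serve such a caller, since nothing outside the formatter's own pipeline would ever initialize it.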
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 8431d3c..d2cc466 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3823,6 +3823,36 @@ tooling::Replacements sortUsingDeclarations(const FormatStyle &Style, return UsingDeclarationsSorter(*Env, Style).process().first; } +LangOptions getFormattingLangOpts(const FormatStyle &Style) { + LangOptions LangOpts; + + FormatStyle::LanguageStandard LexingStd = Style.Standard; + if (LexingStd == FormatStyle::LS_Auto) + LexingStd = FormatStyle::LS_Latest; + if (LexingStd == FormatStyle::LS_Latest) + LexingStd = FormatStyle::LS_Cpp20; + LangOpts.CPlusPlus = 1; + LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; + LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; + LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; + LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; + LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; + // Turning on digraphs in standards before C++0x is error-prone, because e.g. + // the sequence "<::" will be unconditionally treated as "[:". + // Cf. Lexer::LexTokenInternal. + LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; + + LangOpts.LineComment = 1; + bool AlternativeOperators = Style.isCpp(); + LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; + LangOpts.Bool = 1; + LangOpts.ObjC = 1; + LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. + LangOpts.DeclSpecKeyword = 1; // To get __declspec. + LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. + return LangOpts; +} + const char *StyleOptionHelpDescription = "Set coding style. can be:\n" "1. A preset: LLVM, GNU, Google, Chromium, Microsoft,\n" diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index a57659f..a87d0ba 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -13,7 +13,11 @@ //===----------------------------------------------------------------------===// #include "FormatTokenLexer.h" -#include "TokenAnalyzer.h" +#include "FormatToken.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Format/Format.h" +#include "llvm/Support/Regex.h" namespace clang { namespace format { @@ -24,12 +28,12 @@ FormatTokenLexer::FormatTokenLexer( llvm::SpecificBumpPtrAllocator &Allocator, IdentifierTable &IdentTable) : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), - Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), + Column(Column), TrailingWhitespace(0), + LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), Style(Style), IdentTable(IdentTable), Keywords(IdentTable), Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), MacroBlockEndRegex(Style.MacroBlockEnd) { - assert(LangOpts.CPlusPlus); Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts)); Lex->SetKeepWhitespaceMode(true); @@ -1438,7 +1442,7 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) { void FormatTokenLexer::resetLexer(unsigned Offset) { StringRef Buffer = SourceMgr.getBufferData(ID); - assert(LangOpts.CPlusPlus); + LangOpts = getFormattingLangOpts(Style); Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts, Buffer.begin(), Buffer.begin() + Offset, Buffer.end())); Lex->SetKeepWhitespaceMode(true); diff --git a/clang/lib/Format/FormatTokenLexer.h 
b/clang/lib/Format/FormatTokenLexer.h index 0d0f36f..65dd733 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -17,6 +17,7 @@ #include "Encoding.h" #include "FormatToken.h" +#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -119,6 +120,7 @@ private: unsigned Column; unsigned TrailingWhitespace; std::unique_ptr Lex; + LangOptions LangOpts; const SourceManager &SourceMgr; FileID ID; const FormatStyle &Style; diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp index 3c2cedd..87823ae 100644 --- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp +++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp @@ -79,7 +79,7 @@ IntegerLiteralSeparatorFixer::process(const Environment &Env, AffectedRangeManager AffectedRangeMgr(SourceMgr, Env.getCharRanges()); const auto ID = Env.getFileID(); - assert(LangOpts.CPlusPlus); + const auto LangOpts = getFormattingLangOpts(Style); Lexer Lex(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts); Lex.SetCommentRetentionState(true); diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp index 4e77683..bd648c4 100644 --- a/clang/lib/Format/TokenAnalyzer.cpp +++ b/clang/lib/Format/TokenAnalyzer.cpp @@ -35,38 +35,6 @@ namespace clang { namespace format { -LangOptions LangOpts; - -/// Sets `LangOpts` for the formatter. -/// -/// \param `Style` determines specific settings for lexing mode. -static void setFormattingLangOpts(const FormatStyle &Style) { - FormatStyle::LanguageStandard LexingStd = Style.Standard; - if (LexingStd == FormatStyle::LS_Auto) - LexingStd = FormatStyle::LS_Latest; - if (LexingStd == FormatStyle::LS_Latest) - LexingStd = FormatStyle::LS_Cpp20; - LangOpts.CPlusPlus = 1; - LangOpts.CPlusPlus11 = LexingStd >= FormatStyle::LS_Cpp11; - LangOpts.CPlusPlus14 = LexingStd >= FormatStyle::LS_Cpp14; - LangOpts.CPlusPlus17 = LexingStd >= FormatStyle::LS_Cpp17; - LangOpts.CPlusPlus20 = LexingStd >= FormatStyle::LS_Cpp20; - LangOpts.Char8 = LexingStd >= FormatStyle::LS_Cpp20; - // Turning on digraphs in standards before C++0x is error-prone, because e.g. - // the sequence "<::" will be unconditionally treated as "[:". - // Cf. Lexer::LexTokenInternal. - LangOpts.Digraphs = LexingStd >= FormatStyle::LS_Cpp11; - - LangOpts.LineComment = 1; - bool AlternativeOperators = Style.isCpp(); - LangOpts.CXXOperatorNames = AlternativeOperators ? 1 : 0; - LangOpts.Bool = 1; - LangOpts.ObjC = 1; - LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. - LangOpts.DeclSpecKeyword = 1; // To get __declspec. - LangOpts.C99 = 1; // To get kw_restrict for non-underscore-prefixed restrict. -} - // FIXME: Instead of printing the diagnostic we should store it and have a // better way to return errors through the format APIs. 
class FatalDiagnosticConsumer : public DiagnosticConsumer { @@ -131,11 +99,9 @@ TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style) std::pair TokenAnalyzer::process(bool SkipAnnotation) { - setFormattingLangOpts(Style); - tooling::Replacements Result; llvm::SpecificBumpPtrAllocator Allocator; - IdentifierTable IdentTable(LangOpts); + IdentifierTable IdentTable(getFormattingLangOpts(Style)); FormatTokenLexer Lex(Env.getSourceManager(), Env.getFileID(), Env.getFirstStartColumn(), Style, Encoding, Allocator, IdentTable); diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h index 18c1431..4086dab 100644 --- a/clang/lib/Format/TokenAnalyzer.h +++ b/clang/lib/Format/TokenAnalyzer.h @@ -34,8 +34,6 @@ namespace clang { namespace format { -extern LangOptions LangOpts; - class Environment { public: // This sets up an virtual file system with file \p FileName containing the diff --git a/clang/unittests/Format/TestLexer.h b/clang/unittests/Format/TestLexer.h index 6a3d0bd..8b5949b 100644 --- a/clang/unittests/Format/TestLexer.h +++ b/clang/unittests/Format/TestLexer.h @@ -61,9 +61,7 @@ public: std::vector> &Buffers, FormatStyle Style = getLLVMStyle()) : Allocator(Allocator), Buffers(Buffers), Style(Style), - SourceMgr("test.cpp", ""), IdentTable(LangOpts) { - assert(LangOpts.CPlusPlus); - } + SourceMgr("test.cpp", ""), IdentTable(getFormattingLangOpts(Style)) {} TokenList lex(llvm::StringRef Code) { FormatTokenLexer Lex = getNewLexer(Code); -- cgit v1.1 From b1771475da91805a4ac1831810b62a7b3655ccca Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 11 Feb 2024 22:25:16 +0000 Subject: [AArch64][GlobalISel] Additional insert and extract GISel tests. NFC --- llvm/test/CodeGen/AArch64/insertextract.ll | 2256 ++++++++++++++++++++++++++++ 1 file changed, 2256 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/insertextract.ll diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll new file mode 100644 index 0000000..794abca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -0,0 +1,2256 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for insert_v2f64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f16_c +; 
CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i8_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i8_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_c +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_c + +define <2 x double> @insert_v2f64_0(<2 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v2f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x double> %a, double %b, i32 0 + ret <2 x double> %d +} + +define <2 x double> @insert_v2f64_1(<2 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v2f64_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x double> %a, double %b, i32 1 + ret <2 x double> %d +} + +define <2 x double> @insert_v2f64_c(<2 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v2f64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #3, #1 +; CHECK-NEXT: str d1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = 
insertelement <2 x double> %a, double %b, i32 %c + ret <2 x double> %d +} + +define <3 x double> @insert_v3f64_0(<3 x double> %a, double %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3f64_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-SD-NEXT: mov v0.d[0], v3.d[0] +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3f64_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x double> %a, double %b, i32 0 + ret <3 x double> %d +} + +define <3 x double> @insert_v3f64_2(<3 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v3f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d2, d3 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x double> %a, double %b, i32 2 + ret <3 x double> %d +} + +define <3 x double> @insert_v3f64_c(<3 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v3f64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x0, #0x3 +; CHECK-NEXT: str d3, [x8, x9, lsl #3] +; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: ldr d2, [sp, #16] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x double> %a, double %b, i32 %c + ret <3 x double> %d +} + +define <4 x double> @insert_v4f64_0(<4 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v4f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[0], v2.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x double> %a, double %b, i32 0 + ret <4 x double> %d +} + +define <4 x double> @insert_v4f64_2(<4 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v4f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v1.d[0], v2.d[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x double> %a, double %b, i32 2 + ret <4 x double> %d +} + +define <4 x double> @insert_v4f64_c(<4 x double> %a, double %b, i32 %c) { +; CHECK-LABEL: insert_v4f64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0x3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str d2, [x9, x8, lsl #3] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x double> %a, double %b, i32 %c + ret <4 x double> %d +} + +define <2 x float> @insert_v2f32_0(<2 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v2f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x float> %a, float %b, i32 0 + ret <2 x float> %d +} + +define <2 x float> @insert_v2f32_1(<2 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v2f32_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x float> %a, float %b, i32 1 + ret <2 x float> %d +} + +define <2 x float> @insert_v2f32_c(<2 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v2f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x0, #2, #1 +; CHECK-NEXT: str s1, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x float> %a, float %b, i32 %c + ret <2 x float> %d +} + +define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3f32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-SD-NEXT: mov v1.s[1], v0.s[1] +; CHECK-SD-NEXT: mov v1.s[2], v0.s[2] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3f32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[3], v0.s[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x float> %a, float %b, i32 0 + ret <3 x float> %d +} + +define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3f32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3f32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[3], v0.s[0] +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x float> %a, float %b, i32 2 + ret <3 x float> %d +} + +define <3 x float> @insert_v3f32_c(<3 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v3f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #2, #2 +; CHECK-NEXT: str s1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + 
%d = insertelement <3 x float> %a, float %b, i32 %c + ret <3 x float> %d +} + +define <4 x float> @insert_v4f32_0(<4 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v4f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x float> %a, float %b, i32 0 + ret <4 x float> %d +} + +define <4 x float> @insert_v4f32_2(<4 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v4f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x float> %a, float %b, i32 2 + ret <4 x float> %d +} + +define <4 x float> @insert_v4f32_c(<4 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v4f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #2, #2 +; CHECK-NEXT: str s1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x float> %a, float %b, i32 %c + ret <4 x float> %d +} + +define <8 x float> @insert_v8f32_0(<8 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v8f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: mov v0.s[0], v2.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x float> %a, float %b, i32 0 + ret <8 x float> %d +} + +define <8 x float> @insert_v8f32_2(<8 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v8f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: mov v0.s[2], v2.s[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x float> %a, float %b, i32 2 + ret <8 x float> %d +} + +define <8 x float> @insert_v8f32_c(<8 x float> %a, float %b, i32 %c) { +; CHECK-LABEL: insert_v8f32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0x7 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str s2, [x9, x8, lsl #2] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x float> %a, float %b, i32 %c + ret <8 x float> %d +} + +define <4 x half> @insert_v4f16_0(<4 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v4f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x half> %a, half %b, i32 0 + ret <4 x half> %d +} + +define <4 x half> @insert_v4f16_2(<4 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v4f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x half> %a, half %b, i32 2 + ret <4 x half> %d +} + +define <4 x half> @insert_v4f16_c(<4 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v4f16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x0, #1, #2 +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x half> %a, half %b, i32 %c + ret <4 x half> %d +} + +define <8 x half> @insert_v8f16_0(<8 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v8f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x half> %a, half %b, i32 0 + ret <8 x half> %d +} + +define <8 x half> @insert_v8f16_2(<8 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v8f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x half> %a, half %b, i32 2 + ret <8 x half> %d +} + +define <8 x half> @insert_v8f16_c(<8 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v8f16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x0, #1, #3 +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x half> %a, half %b, i32 %c + ret <8 x half> %d +} + +define <16 x half> @insert_v16f16_0(<16 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v16f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x half> %a, half %b, i32 0 + ret <16 x half> %d +} + +define <16 x half> @insert_v16f16_2(<16 x half> %a, half %b, i32 %c) { +; CHECK-LABEL: insert_v16f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: mov v0.h[2], v2.h[0] +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x half> %a, half %b, i32 2 + ret <16 x half> %d +} + +define <16 x half> @insert_v16f16_c(<16 x half> %a, half %b, i32 %c) { +; 
CHECK-LABEL: insert_v16f16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0xf +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str h2, [x9, x8, lsl #1] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x half> %a, half %b, i32 %c + ret <16 x half> %d +} + +define <8 x i8> @insert_v8i8_0(<8 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v8i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[0], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i8> %a, i8 %b, i32 0 + ret <8 x i8> %d +} + +define <8 x i8> @insert_v8i8_2(<8 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v8i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[2], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i8> %a, i8 %b, i32 2 + ret <8 x i8> %d +} + +define <8 x i8> @insert_v8i8_c(<8 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v8i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfxil x8, x1, #0, #3 +; CHECK-NEXT: strb w0, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i8> %a, i8 %b, i32 %c + ret <8 x i8> %d +} + +define <16 x i8> @insert_v16i8_0(<16 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v16i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i8> %a, i8 %b, i32 0 + ret <16 x i8> %d +} + +define <16 x i8> @insert_v16i8_2(<16 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v16i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i8> %a, i8 %b, i32 2 + ret <16 x i8> %d +} + +define <16 x i8> @insert_v16i8_c(<16 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v16i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfxil x8, x1, #0, #4 +; CHECK-NEXT: strb w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i8> %a, i8 %b, i32 %c + ret <16 x i8> %d +} + +define <32 x i8> @insert_v32i8_0(<32 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v32i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <32 x i8> %a, i8 %b, i32 0 + ret <32 x i8> %d +} + +define <32 x i8> @insert_v32i8_2(<32 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v32i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.b[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <32 x i8> %a, i8 %b, i32 2 + ret <32 x i8> %d +} + +define <32 x i8> @insert_v32i8_c(<32 x i8> %a, i8 %b, i32 %c) { +; CHECK-LABEL: insert_v32i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0x1f +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: strb w0, [x9, x8] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <32 x i8> %a, i8 %b, i32 %c + ret <32 x i8> %d +} + +define <4 x i16> @insert_v4i16_0(<4 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v4i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i16> %a, i16 %b, i32 0 + ret <4 x i16> %d +} + +define <4 x i16> @insert_v4i16_2(<4 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v4i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[2], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i16> %a, i16 %b, i32 2 + ret <4 x i16> %d +} + +define <4 x i16> @insert_v4i16_c(<4 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v4i16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x1, #1, #2 +; CHECK-NEXT: strh w0, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i16> %a, i16 %b, i32 %c + ret <4 x i16> %d +} + +define <8 x i16> @insert_v8i16_0(<8 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v8i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i16> %a, i16 %b, i32 0 + ret <8 x i16> %d +} + +define <8 x i16> @insert_v8i16_2(<8 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v8i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i16> %a, i16 %b, i32 2 + ret <8 x i16> %d +} + +define <8 x i16> @insert_v8i16_c(<8 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v8i16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #1, #3 +; CHECK-NEXT: strh w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i16> %a, i16 %b, i32 %c + ret <8 x i16> %d +} + +define <16 x i16> @insert_v16i16_0(<16 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v16i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i16> %a, i16 %b, i32 0 + ret <16 x i16> %d +} + +define <16 x i16> @insert_v16i16_2(<16 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v16i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.h[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i16> %a, i16 %b, i32 2 + ret <16 x i16> %d +} + +define <16 x i16> @insert_v16i16_c(<16 x i16> %a, i16 %b, i32 %c) { +; CHECK-LABEL: insert_v16i16_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0xf +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: strh w0, [x9, x8, lsl #1] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <16 x i16> %a, i16 %b, i32 %c + ret <16 x i16> %d +} + +define <2 x i32> @insert_v2i32_0(<2 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v2i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[0], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i32> %a, i32 %b, i32 0 + ret <2 x i32> %d +} + +define <2 x i32> @insert_v2i32_1(<2 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v2i32_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i32> %a, i32 %b, i32 1 + ret <2 x i32> %d +} + +define <2 x i32> @insert_v2i32_c(<2 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v2i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str d0, [sp, #8] +; CHECK-NEXT: bfi x8, x1, #2, #1 +; CHECK-NEXT: str w0, [x8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i32> %a, i32 %b, i32 %c + ret <2 x i32> %d +} + +define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3i32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s1, w0 +; CHECK-SD-NEXT: mov v1.s[1], v0.s[1] +; CHECK-SD-NEXT: mov v1.s[2], v0.s[2] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3i32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x i32> %a, i32 %b, i32 0 + ret <3 x i32> %d +} + +define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov v0.s[2], w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[3], v0.s[0] +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x i32> %a, i32 %b, i32 2 + ret <3 x i32> %d +} + +define <3 x i32> @insert_v3i32_c(<3 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v3i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #2, #2 +; CHECK-NEXT: str w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x i32> %a, i32 %b, i32 %c + ret <3 x i32> %d +} + +define <4 x i32> @insert_v4i32_0(<4 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v4i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i32> %a, i32 %b, i32 0 
+ ret <4 x i32> %d +} + +define <4 x i32> @insert_v4i32_2(<4 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v4i32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i32> %a, i32 %b, i32 2 + ret <4 x i32> %d +} + +define <4 x i32> @insert_v4i32_c(<4 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v4i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #2, #2 +; CHECK-NEXT: str w0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i32> %a, i32 %b, i32 %c + ret <4 x i32> %d +} + +define <8 x i32> @insert_v8i32_0(<8 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v8i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[0], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i32> %a, i32 %b, i32 0 + ret <8 x i32> %d +} + +define <8 x i32> @insert_v8i32_2(<8 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v8i32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[2], w0 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i32> %a, i32 %b, i32 2 + ret <8 x i32> %d +} + +define <8 x i32> @insert_v8i32_c(<8 x i32> %a, i32 %b, i32 %c) { +; CHECK-LABEL: insert_v8i32_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0x7 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str w0, [x9, x8, lsl #2] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <8 x i32> %a, i32 %b, i32 %c + ret <8 x i32> %d +} + +define <2 x i64> @insert_v2i64_0(<2 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v2i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[0], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i64> %a, i64 %b, i32 0 + ret <2 x i64> %d +} + +define <2 x i64> @insert_v2i64_1(<2 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v2i64_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i64> %a, i64 %b, i32 1 + ret <2 x i64> %d +} + +define <2 x i64> @insert_v2i64_c(<2 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v2i64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bfi x8, x1, #3, #1 +; CHECK-NEXT: str x0, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %d = insertelement <2 x i64> %a, i64 %b, i32 %c + ret <2 x i64> %d +} + +define <3 x i64> @insert_v3i64_0(<3 x i64> %a, i64 %b, i32 %c) { +; CHECK-SD-LABEL: insert_v3i64_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: mov v0.d[0], x0 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v3i64_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement <3 x i64> %a, i64 %b, i32 0 + ret <3 x i64> %d +} + +define <3 x i64> @insert_v3i64_2(<3 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v3i64_2: 
+; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d2, x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x i64> %a, i64 %b, i32 2 + ret <3 x i64> %d +} + +define <3 x i64> @insert_v3i64_c(<3 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v3i64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x1, #0x3 +; CHECK-NEXT: str x0, [x8, x9, lsl #3] +; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: ldr d2, [sp, #16] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <3 x i64> %a, i64 %b, i32 %c + ret <3 x i64> %d +} + +define <4 x i64> @insert_v4i64_0(<4 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v4i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[0], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i64> %a, i64 %b, i32 0 + ret <4 x i64> %d +} + +define <4 x i64> @insert_v4i64_2(<4 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v4i64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.d[0], x0 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i64> %a, i64 %b, i32 2 + ret <4 x i64> %d +} + +define <4 x i64> @insert_v4i64_c(<4 x i64> %a, i64 %b, i32 %c) { +; CHECK-LABEL: insert_v4i64_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x1, #0x3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str x0, [x9, x8, lsl #3] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret +entry: + %d = insertelement <4 x i64> %a, i64 %b, i32 %c + ret <4 x i64> %d +} + +define double @extract_v2f64_0(<2 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v2f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x double> %a, i32 0 + ret double %d +} + +define double @extract_v2f64_1(<2 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v2f64_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x double> %a, i32 1 + ret double %d +} + +define double @extract_v2f64_c(<2 x double> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2f64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #3, #1 +; CHECK-SD-NEXT: ldr d0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2f64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr d0, [x8, x9, lsl #3] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x double> %a, i32 %c + ret double %d +} + +define double @extract_v3f64_0(<3 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v3f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x double> %a, i32 0 + ret double %d +} + +define double @extract_v3f64_2(<3 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v3f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x double> %a, i32 2 + ret double %d +} + +define double @extract_v3f64_c(<3 x double> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3f64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3f64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: stp q0, q2, [sp] +; CHECK-GI-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x double> %a, i32 %c + ret double %d +} + +define double @extract_v4f64_0(<4 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v4f64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x double> %a, i32 0 + ret double %d +} + +define double @extract_v4f64_2(<4 x double> %a, i32 %c) { +; CHECK-LABEL: extract_v4f64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x double> %a, i32 2 + ret double %d +} + +define double @extract_v4f64_c(<4 x double> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: ldr d0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x double> %a, i32 %c + ret double %d +} + +define float @extract_v2f32_0(<2 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2f32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2f32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x float> %a, i32 0 + ret float %d +} + +define float @extract_v2f32_1(<2 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v2f32_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov s0, v0.s[1] +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x float> %a, i32 1 + ret float %d +} + +define float @extract_v2f32_c(<2 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #2, #1 +; CHECK-SD-NEXT: ldr s0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x float> %a, i32 %c + ret float %d +} + +define float @extract_v3f32_0(<3 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v3f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x float> %a, i32 0 + ret float %d +} + +define float @extract_v3f32_2(<3 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v3f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s0, v0.s[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x float> %a, i32 2 + ret float %d +} + +define float @extract_v3f32_c(<3 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr s0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2] +; 
CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x float> %a, i32 %c + ret float %d +} + +define float @extract_v4f32_0(<4 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v4f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x float> %a, i32 0 + ret float %d +} + +define float @extract_v4f32_2(<4 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v4f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s0, v0.s[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x float> %a, i32 2 + ret float %d +} + +define float @extract_v4f32_c(<4 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr s0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x float> %a, i32 %c + ret float %d +} + +define float @extract_v8f32_0(<8 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v8f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x float> %a, i32 0 + ret float %d +} + +define float @extract_v8f32_2(<8 x float> %a, i32 %c) { +; CHECK-LABEL: extract_v8f32_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s0, v0.s[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x float> %a, i32 2 + ret float %d +} + +define float @extract_v8f32_c(<8 x float> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8f32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x7 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr s0, [x9, x8, lsl #2] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8f32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x7 +; CHECK-GI-NEXT: ldr s0, [x9, x8, lsl #2] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x float> %a, i32 %c + ret float %d +} + +define half @extract_v4f16_0(<4 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f16_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f16_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x half> %a, i32 0 + ret half %d +} + +define half @extract_v4f16_2(<4 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v4f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x half> %a, i32 2 + ret half %d +} + +define half @extract_v4f16_c(<4 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4f16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #1, #2 +; CHECK-SD-NEXT: ldr h0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr h0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x half> %a, i32 %c + ret half %d +} + +define half @extract_v8f16_0(<8 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v8f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x half> %a, i32 0 + ret half %d +} + +define half @extract_v8f16_2(<8 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v8f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x half> %a, i32 2 + ret half %d +} + +define half @extract_v8f16_c(<8 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8f16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #1, #3 +; CHECK-SD-NEXT: ldr h0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8f16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x7 +; CHECK-GI-NEXT: ldr h0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, 
sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x half> %a, i32 %c + ret half %d +} + +define half @extract_v16f16_0(<16 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v16f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x half> %a, i32 0 + ret half %d +} + +define half @extract_v16f16_2(<16 x half> %a, i32 %c) { +; CHECK-LABEL: extract_v16f16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x half> %a, i32 2 + ret half %d +} + +define half @extract_v16f16_c(<16 x half> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v16f16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0xf +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr h0, [x9, x8, lsl #1] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v16f16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0xf +; CHECK-GI-NEXT: ldr h0, [x9, x8, lsl #1] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <16 x half> %a, i32 %c + ret half %d +} + +define i8 @extract_v8i8_0(<8 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v8i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i8> %a, i32 0 + ret i8 %d +} + +define i8 @extract_v8i8_2(<8 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v8i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i8> %a, i32 2 + ret i8 %d +} + +define i8 @extract_v8i8_c(<8 x i8> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i8_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfxil x8, x0, #0, #3 +; CHECK-SD-NEXT: ldrb w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i8_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x7 +; CHECK-GI-NEXT: lsl x10, x9, #1 +; CHECK-GI-NEXT: sub x9, x10, x9 +; CHECK-GI-NEXT: ldrb w0, [x8, x9] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i8> %a, i32 %c + ret i8 %d +} + +define i8 @extract_v16i8_0(<16 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v16i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i8> %a, i32 0 + ret i8 %d +} + +define i8 @extract_v16i8_2(<16 x i8> %a, i32 
%c) { +; CHECK-LABEL: extract_v16i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i8> %a, i32 2 + ret i8 %d +} + +define i8 @extract_v16i8_c(<16 x i8> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v16i8_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfxil x8, x0, #0, #4 +; CHECK-SD-NEXT: ldrb w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v16i8_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0xf +; CHECK-GI-NEXT: lsl x10, x9, #1 +; CHECK-GI-NEXT: sub x9, x10, x9 +; CHECK-GI-NEXT: ldrb w0, [x8, x9] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <16 x i8> %a, i32 %c + ret i8 %d +} + +define i8 @extract_v32i8_0(<32 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v32i8_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <32 x i8> %a, i32 0 + ret i8 %d +} + +define i8 @extract_v32i8_2(<32 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v32i8_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.b[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <32 x i8> %a, i32 2 + ret i8 %d +} + +define i8 @extract_v32i8_c(<32 x i8> %a, i32 %c) { +; CHECK-LABEL: extract_v32i8_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: and x8, x0, #0x1f +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldrb w0, [x9, x8] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %d = extractelement <32 x i8> %a, i32 %c + ret i8 %d +} + +define i16 @extract_v4i16_0(<4 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v4i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i16> %a, i32 0 + ret i16 %d +} + +define i16 @extract_v4i16_2(<4 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v4i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i16> %a, i32 2 + ret i16 %d +} + +define i16 @extract_v4i16_c(<4 x i16> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #1, #2 +; CHECK-SD-NEXT: ldrh w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldrh w0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i16> %a, i32 %c + ret i16 %d +} + +define i16 @extract_v8i16_0(<8 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v8i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i16> %a, i32 0 + ret i16 %d +} + +define i16 @extract_v8i16_2(<8 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v8i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i16> %a, i32 2 + ret i16 %d +} + +define i16 @extract_v8i16_c(<8 x i16> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #1, #3 +; CHECK-SD-NEXT: ldrh w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x7 +; CHECK-GI-NEXT: ldrh w0, [x8, x9, lsl #1] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i16> %a, i32 %c + ret i16 %d +} + +define i16 @extract_v16i16_0(<16 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v16i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i16> %a, i32 0 + ret i16 %d +} + +define i16 @extract_v16i16_2(<16 x i16> %a, i32 %c) { +; CHECK-LABEL: extract_v16i16_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret +entry: + %d = extractelement <16 x i16> %a, i32 2 + ret i16 %d +} + +define i16 
@extract_v16i16_c(<16 x i16> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v16i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0xf +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldrh w0, [x9, x8, lsl #1] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v16i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0xf +; CHECK-GI-NEXT: ldrh w0, [x9, x8, lsl #1] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <16 x i16> %a, i32 %c + ret i16 %d +} + +define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v2i32_1(<2 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i32_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i32_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s0, v0.s[1] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i32> %a, i32 1 + ret i32 %d +} + +define i32 @extract_v2i32_c(<2 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str d0, [sp, #8] +; CHECK-SD-NEXT: bfi x8, x0, #2, #1 +; CHECK-SD-NEXT: ldr w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: str d0, [sp, #8] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i32> %a, i32 %c + ret i32 %d +} + +define i32 @extract_v3i32_0(<3 x i32> %a, i32 %c) { +; CHECK-LABEL: extract_v3i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %d = extractelement <3 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v3i32_2(<3 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w0, v0.s[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = 
extractelement <3 x i32> %a, i32 2 + ret i32 %d +} + +define i32 @extract_v3i32_c(<3 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i32> %a, i32 %c + ret i32 %d +} + +define i32 @extract_v4i32_0(<4 x i32> %a, i32 %c) { +; CHECK-LABEL: extract_v4i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v4i32_2(<4 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w0, v0.s[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i32> %a, i32 2 + ret i32 %d +} + +define i32 @extract_v4i32_c(<4 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #2, #2 +; CHECK-SD-NEXT: ldr w0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i32> %a, i32 %c + ret i32 %d +} + +define i32 @extract_v8i32_0(<8 x i32> %a, i32 %c) { +; CHECK-LABEL: extract_v8i32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %d = extractelement <8 x i32> %a, i32 0 + ret i32 %d +} + +define i32 @extract_v8i32_2(<8 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i32_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w0, v0.s[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i32_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i32> %a, i32 2 + ret i32 %d +} + +define i32 @extract_v8i32_c(<8 x i32> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v8i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! 
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x7 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr w0, [x9, x8, lsl #2] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v8i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x7 +; CHECK-GI-NEXT: ldr w0, [x9, x8, lsl #2] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <8 x i32> %a, i32 %c + ret i32 %d +} + +define i64 @extract_v2i64_0(<2 x i64> %a, i32 %c) { +; CHECK-LABEL: extract_v2i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %d = extractelement <2 x i64> %a, i32 0 + ret i64 %d +} + +define i64 @extract_v2i64_1(<2 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i64_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x0, v0.d[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i64_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i64> %a, i32 1 + ret i64 %d +} + +define i64 @extract_v2i64_c(<2 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v2i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: bfi x8, x0, #3, #1 +; CHECK-SD-NEXT: ldr x0, [x8] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w9, w0 +; CHECK-GI-NEXT: mov x8, sp +; CHECK-GI-NEXT: str q0, [sp] +; CHECK-GI-NEXT: and x9, x9, #0x1 +; CHECK-GI-NEXT: ldr x0, [x8, x9, lsl #3] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <2 x i64> %a, i32 %c + ret i64 %d +} + +define i64 @extract_v3i64_0(<3 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i64_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i64_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i64> %a, i32 0 + ret i64 %d +} + +define i64 @extract_v3i64_2(<3 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i64_2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: fmov x0, d2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i64_2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x0, d2 +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i64> %a, i32 2 + ret i64 %d +} + +define i64 @extract_v3i64_c(<3 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v3i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $w0 killed 
$w0 def $x0 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v3i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: stp q0, q2, [sp] +; CHECK-GI-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <3 x i64> %a, i32 %c + ret i64 %d +} + +define i64 @extract_v4i64_0(<4 x i64> %a, i32 %c) { +; CHECK-LABEL: extract_v4i64_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i64> %a, i32 0 + ret i64 %d +} + +define i64 @extract_v4i64_2(<4 x i64> %a, i32 %c) { +; CHECK-LABEL: extract_v4i64_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d1 +; CHECK-NEXT: ret +entry: + %d = extractelement <4 x i64> %a, i32 2 + ret i64 %d +} + +define i64 @extract_v4i64_c(<4 x i64> %a, i32 %c) { +; CHECK-SD-LABEL: extract_v4i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: and x8, x0, #0x3 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x3 +; CHECK-GI-NEXT: ldr x0, [x9, x8, lsl #3] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %d = extractelement <4 x i64> %a, i32 %c + ret i64 %d +} -- cgit v1.1 From 1a988869319bb4cfe04b3d2618818180b3cfb28c Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Sun, 11 Feb 2024 20:02:37 -0500 Subject: [libc] Remove extra ] in stdc.td. 
(#81438) --- libc/spec/stdc.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index afddc77..011abbf 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -401,7 +401,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>, FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>, FunctionSpec<"frexpl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<IntPtr>]>, - GuardedFunctionSpec<"frexpf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<IntPtr>]], "LIBC_COMPILER_HAS_FLOAT128">, + GuardedFunctionSpec<"frexpf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<IntPtr>], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"hypot", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"hypotf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, -- cgit v1.1 From 5da801386c2b820a4596fc6d8da6b5f4a6da94b4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 11 Feb 2024 20:21:29 -0800 Subject: [AArch64AsmParser] Allow branch target symbol to have a shift/extend modifier name (#80571) Shift and extend modifiers are parsed as separate operands. When a symbol operand of a branch instruction has such a "bad" name, AArch64AsmParser will report an error. ``` % cat a.c void lsl(); void lsr(); void asr(); void ror(); void uxtb(); void sxtx(); void foo() { lsl(); lsr(); asr(); ror(); uxtb(); sxtx(); } % clang --target=aarch64 -c -save-temps a.c a.s:15:8: error: expected #imm after shift specifier bl lsl ^ a.s:16:8: error: expected #imm after shift specifier bl lsr ^ a.s:17:8: error: expected #imm after shift specifier bl asr ^ a.s:18:8: error: expected #imm after shift specifier bl ror ^ a.s:19:5: error: expected label or encodable integer pc offset bl uxtb ^ a.s:20:5: error: expected label or encodable integer pc offset bl sxtx ^ ``` In contrast, gas correctly parses these instructions. Fix #79729 by parsing a shift/extend modifier only after an immediate value/register. --- .../Target/AArch64/AsmParser/AArch64AsmParser.cpp | 33 +++++++++++++++++----- llvm/test/MC/AArch64/arm64-adr.s | 10 +++++++ llvm/test/MC/AArch64/arm64-branch-encoding.s | 10 +++++++ llvm/test/MC/AArch64/basic-a64-diagnostics.s | 8 ++++++ 4 files changed, 54 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e9d96f3..4e7c8f6 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4809,20 +4809,30 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, return parseCondCode(Operands, invertCondCode); // If it's a register name, parse it. - if (!parseRegister(Operands)) + if (!parseRegister(Operands)) { + // Parse an optional shift/extend modifier. + AsmToken SavedTok = getTok(); + if (parseOptionalToken(AsmToken::Comma)) { + // The operand after the register may be a label (e.g. ADR/ADRP). Check + // such cases and don't report an error when

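A rough sketch of the post-patch behavior the commit message describes (illustrative input only; the file name and instruction mix here are assumptions, not taken from the patch's tests): a modifier-like name after a branch or ADR must parse as an ordinary label, while a comma-separated modifier following a register still parses as a shift/extend.

```
// t.s -- hypothetical input for a post-patch assembler
bl   lsl                  // "lsl" is an ordinary symbol here, not a shift
bl   sxtx                 // likewise for extend-modifier names
adr  x0, ror              // ADR/ADRP targets may also use such names
add  x0, x1, x2, lsl #2   // a real shift modifier: follows a register
ldr  w0, [x1, x2, sxtx]   // a real extend modifier in an address operand
```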